317 files changed, 10956 insertions, 14623 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..7e0511476797 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
          Choose Y here to enable persistent, read-only local
          caching support for 9p clients using FS-Cache
+config 9P_FS_POSIX_ACL
+        bool "9P POSIX Access Control Lists"
+        depends on 9P_FS
+        select FS_POSIX_ACL
+        help
+          POSIX Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          To learn more about Access Control Lists, visit the POSIX ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          If you don't know what Access Control Lists are, say N
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..f8ba37effd1b 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
        xattr_user.o
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..12d602351dbe
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+/*
+ * Read the xattr @name through @fid and convert it to a posix_acl.
+ *
+ * The attribute is read twice: first with a NULL buffer to learn its size,
+ * then into a freshly allocated buffer of that size.
+ *
+ * Returns the parsed ACL, NULL when the attribute is empty or the read
+ * reports -ENODATA/-ENOSYS/-EOPNOTSUPP, ERR_PTR(-ENOMEM) when the buffer
+ * cannot be allocated, ERR_PTR(-EIO) for any other read failure, or
+ * whatever posix_acl_from_xattr() returns for the data that was read.
+ */
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+        ssize_t size;
+        void *value = NULL;
+        struct posix_acl *acl = NULL;
+        size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+        if (size > 0) {
+                value = kzalloc(size, GFP_NOFS);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                size = v9fs_fid_xattr_get(fid, name, value, size);
+        }
+        /*
+         * Classify the result of whichever read happened last, so that a
+         * failure of the second read is reported the same way as a failure
+         * of the size probe instead of being mistaken for "no ACL".
+         */
+        if (size > 0)
+                acl = posix_acl_from_xattr(value, size);
+        else if (size != 0 && size != -ENODATA &&
+                 size != -ENOSYS && size != -EOPNOTSUPP)
+                acl = ERR_PTR(-EIO);
+        kfree(value);
+        return acl;
+}
+/*
+ * Populate the inode's cached default and access ACLs.
+ *
+ * When the session is not mounted with access=client both cache slots are
+ * set to NULL and nothing is fetched. Otherwise both ACL xattrs are read
+ * through @fid and cached.
+ *
+ * Returns 0 on success, -EIO if either ACL could not be read (in which
+ * case the cache is left untouched).
+ */
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+        int retval = 0;
+        struct posix_acl *pacl, *dacl;
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(inode);
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+                set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+                set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+                return 0;
+        }
+        /* get the default/access acl values and cache them */
+        dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+        pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+        if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+                set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+                set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+        } else
+                retval = -EIO;
+        /*
+         * Drop our references individually: when only one lookup failed,
+         * the other one still returned an ACL that must not be leaked.
+         */
+        if (!IS_ERR(dacl))
+                posix_acl_release(dacl);
+        if (!IS_ERR(pacl))
+                posix_acl_release(pacl);
+        return retval;
+}
+/*
+ * Return the ACL of the given @type cached on @inode.
+ *
+ * 9p is expected to have cached the ACL when the inode was instantiated
+ * (v9fs_inode_from_fid), so finding the slot still marked "not cached" is
+ * treated as a kernel bug. Callers in this file release the returned ACL
+ * with posix_acl_release().
+ */
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+        struct posix_acl *cached = get_cached_acl(inode, type);
+        BUG_ON(cached == ACL_NOT_CACHED);
+        return cached;
+}
+/*
+ * ACL permission check for @inode against the access @mask.
+ *
+ * Returns 0 straight away when the session is not mounted with
+ * access=client (the cached ACL is not consulted in that mode; presumably
+ * enforcement is left to the server - confirm against the mount option
+ * documentation). With access=client it returns the result of
+ * posix_acl_permission() on the cached access ACL, or -EAGAIN when the
+ * inode has no access ACL.
+ */
+int v9fs_check_acl(struct inode *inode, int mask)
+{
+        int error;
+        struct posix_acl *acl;
+        struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return 0;
+        acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -EAGAIN;
+        error = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return error;
+}
+/*
+ * Cache @acl on the dentry's inode and push it to the server as the
+ * matching POSIX ACL xattr.
+ *
+ * The cache is updated first, before the request is sent. @acl must not
+ * be NULL (it is dereferenced for its entry count) and @type must be
+ * ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT.
+ *
+ * Returns -ENOMEM if the xattr buffer cannot be allocated, a negative
+ * value from posix_acl_to_xattr() if encoding fails, otherwise the result
+ * of v9fs_xattr_set().
+ */
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+        int err;
+        char *xattr_name;
+        size_t len;
+        void *xattr_buf;
+        struct inode *inode = dentry->d_inode;
+        set_cached_acl(inode, type, acl);
+        /* Encode the ACL and send a setxattr request to the server */
+        len = posix_acl_xattr_size(acl->a_count);
+        xattr_buf = kmalloc(len, GFP_KERNEL);
+        if (!xattr_buf)
+                return -ENOMEM;
+        err = posix_acl_to_xattr(acl, xattr_buf, len);
+        if (err >= 0) {
+                if (type == ACL_TYPE_ACCESS)
+                        xattr_name = POSIX_ACL_XATTR_ACCESS;
+                else if (type == ACL_TYPE_DEFAULT)
+                        xattr_name = POSIX_ACL_XATTR_DEFAULT;
+                else
+                        BUG();
+                err = v9fs_xattr_set(dentry, xattr_name, xattr_buf, len, 0);
+        }
+        kfree(xattr_buf);
+        return err;
+}
+/*
+ * Bring the access ACL of @dentry's inode in line with its current mode.
+ *
+ * Works on a private clone of the cached ACL: the clone is adjusted with
+ * posix_acl_chmod_masq() and then stored through v9fs_set_acl().
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when the inode has no access ACL,
+ * -ENOMEM if the clone cannot be allocated, otherwise the result of the
+ * masq/set steps.
+ */
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+        int err;
+        struct posix_acl *cached, *copy;
+        struct inode *inode = dentry->d_inode;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        cached = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (!cached)
+                return 0;
+        copy = posix_acl_clone(cached, GFP_KERNEL);
+        posix_acl_release(cached);
+        if (!copy)
+                return -ENOMEM;
+        err = posix_acl_chmod_masq(copy, inode->i_mode);
+        if (!err)
+                err = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, copy);
+        posix_acl_release(copy);
+        return err;
+}
+/*
+ * Apply the ACLs computed for a newly created object and drop the
+ * references passed in.
+ *
+ * @dpacl is stored as the default ACL and @pacl as the access ACL; either
+ * may be NULL, in which case it is skipped. Failures reported by
+ * v9fs_set_acl() are not propagated: the function always returns 0.
+ */
+int v9fs_set_create_acl(struct dentry *dentry,
+                        struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+        if (dpacl) {
+                v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+                posix_acl_release(dpacl);
+        }
+        if (pacl) {
+                v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+                posix_acl_release(pacl);
+        }
+        return 0;
+}
+/*
+ * Work out the creation mode and the ACLs for a new object in @dir.
+ *
+ * @modep: in/out; the requested mode, updated on success.
+ * @dpacl: on success, receives the parent's default ACL when the new
+ *         object is a directory; otherwise left untouched.
+ * @pacl:  on success, receives the access ACL for the new object when
+ *         posix_acl_create_masq() returns a positive value; otherwise
+ *         left untouched.
+ *
+ * References stored in *dpacl and *pacl belong to the caller
+ * (v9fs_set_create_acl() releases both). Nothing is stored through either
+ * pointer on failure, and every reference taken here that is not handed
+ * out is dropped before returning.
+ *
+ * When @dir has no default ACL (and the object is not a symlink) the
+ * current umask is applied to the mode instead.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+        int retval;
+        int is_dir;
+        mode_t mode = *modep;
+        struct posix_acl *acl = NULL;
+        struct posix_acl *clone;
+        if (!S_ISLNK(mode)) {
+                acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                if (!acl)
+                        mode &= ~current_umask();
+        }
+        if (!acl) {
+                *modep = mode;
+                return 0;
+        }
+        /* Decide inheritance from the mode as requested by the caller */
+        is_dir = S_ISDIR(mode);
+        clone = posix_acl_clone(acl, GFP_NOFS);
+        if (!clone) {
+                posix_acl_release(acl);
+                return -ENOMEM;
+        }
+        retval = posix_acl_create_masq(clone, &mode);
+        if (retval < 0) {
+                posix_acl_release(clone);
+                posix_acl_release(acl);
+                return retval;
+        }
+        /* The clone is only handed out when the masq result is positive */
+        if (retval > 0)
+                *pacl = clone;
+        else
+                posix_acl_release(clone);
+        /* Only directories inherit the default ACL; otherwise drop our ref */
+        if (is_dir)
+                *dpacl = acl;
+        else
+                posix_acl_release(acl);
+        *modep = mode;
+        return 0;
+}
+/*
+ * Fetch an ACL xattr directly from the server, bypassing the ACL cache.
+ *
+ * @type selects the full attribute name (access or default); any other
+ * value is a kernel bug. @name is not used. Returns whatever
+ * v9fs_xattr_get() returns.
+ */
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+                               void *buffer, size_t size, int type)
+{
+        char *xattr_name;
+        if (type == ACL_TYPE_ACCESS)
+                xattr_name = POSIX_ACL_XATTR_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                xattr_name = POSIX_ACL_XATTR_DEFAULT;
+        else
+                BUG();
+        return v9fs_xattr_get(dentry, xattr_name, buffer, size);
+}
+/*
+ * xattr handler "get" callback for the POSIX ACL attributes.
+ *
+ * @name must be the empty string, otherwise -EINVAL is returned. When the
+ * session is not mounted with access=client the request is forwarded to
+ * the server. With access=client the cached ACL of the given @type is
+ * encoded into @buffer.
+ *
+ * Returns -ENODATA when the inode has no such ACL, otherwise the result
+ * of posix_acl_to_xattr() or of the remote read.
+ */
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+                              void *buffer, size_t size, int type)
+{
+        int ret;
+        struct posix_acl *cached;
+        struct inode *inode = dentry->d_inode;
+        struct v9fs_session_info *v9ses;
+        if (strcmp(name, ""))
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(inode);
+        /*
+         * We allow set/get/list of acl when access=client is not specified
+         */
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+        cached = v9fs_get_cached_acl(inode, type);
+        if (IS_ERR(cached))
+                return PTR_ERR(cached);
+        if (!cached)
+                return -ENODATA;
+        ret = posix_acl_to_xattr(cached, buffer, size);
+        posix_acl_release(cached);
+        return ret;
+}
+/*
+ * Store an ACL xattr directly on the server without touching the ACL
+ * cache and without inspecting @value.
+ *
+ * @type selects the full attribute name (access or default); any other
+ * value is a kernel bug. @name is not used. Returns whatever
+ * v9fs_xattr_set() returns.
+ */
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+                              const void *value, size_t size,
+                              int flags, int type)
+{
+        char *xattr_name;
+        if (type == ACL_TYPE_ACCESS)
+                xattr_name = POSIX_ACL_XATTR_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                xattr_name = POSIX_ACL_XATTR_DEFAULT;
+        else
+                BUG();
+        return v9fs_xattr_set(dentry, xattr_name, value, size, flags);
+}
+/*
+ * xattr handler "set" callback for the POSIX ACL attributes.
+ *
+ * @name must be the empty string, otherwise -EINVAL is returned. When the
+ * session is not mounted with access=client the value is passed to the
+ * server unexamined. With access=client the value is parsed and
+ * validated locally, the mode bits are updated for an access ACL, the
+ * xattr is written to the server and, on success, the ACL cache is
+ * updated.
+ *
+ * Returns -EOPNOTSUPP for symlinks, -EPERM when the caller is neither
+ * owner nor suitably capable, -EINVAL for a default ACL on a
+ * non-directory, an error from parsing/validation, or the result of
+ * v9fs_xattr_set().
+ */
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+                              const void *value, size_t size,
+                              int flags, int type)
+{
+        int retval;
+        struct posix_acl *acl;
+        struct v9fs_session_info *v9ses;
+        struct inode *inode = dentry->d_inode;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        /*
+         * set the attribute on the remote. Without even looking at the
+         * xattr value. We leave it to the server to validate
+         */
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return v9fs_remote_set_acl(dentry, name,
+                                           value, size, flags, type);
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!is_owner_or_cap(inode))
+                return -EPERM;
+        if (value) {
+                /* update the cached acl value */
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                else if (acl) {
+                        retval = posix_acl_valid(acl);
+                        if (retval)
+                                goto err_out;
+                }
+        } else
+                acl = NULL;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name = POSIX_ACL_XATTR_ACCESS;
+                if (acl) {
+                        struct iattr iattr;
+                        mode_t mode = inode->i_mode;
+                        retval = posix_acl_equiv_mode(acl, &mode);
+                        if (retval < 0)
+                                goto err_out;
+                        if (retval == 0) {
+                                /*
+                                 * ACL can be represented
+                                 * by the mode bits. So don't
+                                 * update ACL. Drop the parsed ACL
+                                 * here: once the pointer is cleared
+                                 * the release at err_out can no
+                                 * longer free it.
+                                 */
+                                posix_acl_release(acl);
+                                acl = NULL;
+                                value = NULL;
+                                size = 0;
+                        }
+                        /* Update the mode bits */
+                        iattr.ia_mode = ((mode & S_IALLUGO) |
+                                         (inode->i_mode & ~S_IALLUGO));
+                        iattr.ia_valid = ATTR_MODE;
+                        /* FIXME should we update ctime ?
+                         * What is the following setxattr update the
+                         * mode ?
+                         */
+                        v9fs_vfs_setattr_dotl(dentry, &iattr);
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                name = POSIX_ACL_XATTR_DEFAULT;
+                if (!S_ISDIR(inode->i_mode)) {
+                        retval = -EINVAL;
+                        goto err_out;
+                }
+                break;
+        default:
+                BUG();
+        }
+        retval = v9fs_xattr_set(dentry, name, value, size, flags);
+        if (!retval)
+                set_cached_acl(inode, type, acl);
+err_out:
+        posix_acl_release(acl);
+        return retval;
+}
+/* xattr handler for the POSIX access ACL attribute */
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+        .get    = v9fs_xattr_get_acl,
+        .set    = v9fs_xattr_set_acl,
+        .flags  = ACL_TYPE_ACCESS,
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+};
+/* xattr handler for the POSIX default ACL attribute */
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+        .get    = v9fs_xattr_get_acl,
+        .set    = v9fs_xattr_set_acl,
+        .flags  = ACL_TYPE_DEFAULT,
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..59e18c2e8c7e
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+/*
+ * 9p POSIX ACL interface. With CONFIG_9P_FS_POSIX_ACL the functions are
+ * declared here for use by the rest of the 9p code; without it they
+ * collapse to inline stubs that do nothing and return 0.
+ */
+#ifdef CONFIG_9P_FS_POSIX_ACL
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+                               struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                         struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+/* No ACL check callback when ACL support is compiled out */
+#define v9fs_check_acl NULL
+/* Stub: nothing is fetched or cached */
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+        return 0;
+}
+/* Stub: nothing to adjust on chmod */
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+        return 0;
+}
+/* Stub: the ACL arguments are ignored (and not released) */
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+                                      struct posix_acl *dpacl,
+                                      struct posix_acl *pacl)
+{
+        return 0;
+}
+/* Stub: leaves *modep, *dpacl and *pacl untouched */
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                                struct posix_acl **dpacl,
+                                struct posix_acl **pacl)
+{
+        return 0;
+}
+#endif
+#endif /* FS_9P_ACL_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
        switch (access) {
        case V9FS_ACCESS_SINGLE:
        case V9FS_ACCESS_USER:
+        case V9FS_ACCESS_CLIENT:
                uid = current_fsuid();
                any = 0;
                break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
                                v9ses->flags |= V9FS_ACCESS_USER;
                        else if (strcmp(s, "any") == 0)
                                v9ses->flags |= V9FS_ACCESS_ANY;
-                        else {
+                        else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+                                v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+                                P9_DPRINTK(P9_DEBUG_ERROR,
+                                        "access=client option not supported\n");
+                                kfree(s);
+                                ret = -EINVAL;
+                                goto free_and_return;
+#endif
+                        } else {
                                v9ses->flags |= V9FS_ACCESS_SINGLE;
                                v9ses->uid = simple_strtoul(s, &e, 10);
                                if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
+        if (!v9fs_proto_dotl(v9ses) &&
+            ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+                /*
+                 * We support ACCESS_CLIENT only for dotl.
+                 * Fall back to ACCESS_USER
+                 */
+                v9ses->flags &= ~V9FS_ACCESS_MASK;
+                v9ses->flags |= V9FS_ACCESS_USER;
+        }
+        /*FIXME !! */
        /* for legacy mode, fall back to V9FS_ACCESS_ANY */
        if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
                ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..cb6396855e2d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
 *
 * Session flags reflect options selected by users at mount time
 */
+#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+                         V9FS_ACCESS_USER |   \
+                         V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
 enum p9_session_flags {
        V9FS_PROTO_2000U        = 0x01,
        V9FS_PROTO_2000L        = 0x02,
        V9FS_ACCESS_SINGLE      = 0x04,
        V9FS_ACCESS_USER        = 0x08,
-        V9FS_ACCESS_ANY         = 0x0C,
+        V9FS_ACCESS_CLIENT      = 0x10
-        V9FS_ACCESS_MASK        = 0x0C,
 };
 /* possible values of ->cache */
@@ -113,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
-#define V9FS_MAGIC 0x01021997
 /* other default globals */
 #define V9FS_PORT       564
 #define V9FS_DEFUSER    "nobody"
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..bab0eac873f4 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -64,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
        return 0;
 }
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * Having v9fs_direct_IO() in the address space ops vector allows open()
+ * with the O_DIRECT flag, which would otherwise have failed.
+ *
+ * In the non-cached mode, direct read and write requests are shunted off
+ * before the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not 'yet' supported in the cached mode. Hence when this
+ * routine is reached through generic_file_aio_read(), the read/write fails
+ * with -EINVAL.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+                loff_t pos, unsigned long nr_segs)
+{
+        struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+                        "off/no(%lld/%lu) EINVAL\n",
+                        dentry->d_name.name, (long long) pos, nr_segs);
+        return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
      .readpage = v9fs_vfs_readpage,
      .readpages = v9fs_vfs_readpages,
      .releasepage = v9fs_release_page,
      .invalidatepage = v9fs_invalidate_page,
      .launder_page = v9fs_launder_page,
+      .direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
                while (rdir->head < rdir->tail) {
                        err = p9dirent_read(rdir->buf + rdir->head,
-                                                buflen - rdir->head, &curdirent,
+                                                rdir->tail - rdir->head,
+                                                &curdirent,
                                                fid->clnt->proto_version);
                        if (err < 0) {
                                P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -314,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
        .readdir = v9fs_dir_readdir_dotl,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 /**
 * v9fs_file_open - open a file (or directory)
@@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                /* enable cached file options */
                if(file->f_op == &v9fs_file_operations)
                        file->f_op = &v9fs_cached_file_operations;
+                else if (file->f_op == &v9fs_file_operations_dotl)
+                        file->f_op = &v9fs_cached_file_operations_dotl;
 #ifdef CONFIG_9P_FSCACHE
                v9fs_cache_inode_set_cookie(inode, file);
@@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
        return res;
 }
+/*
+ * Take or drop a POSIX lock locally, then mirror it on the 9p server.
+ *
+ * The lock is first applied through posix_lock_file_wait(). It is then
+ * converted to a p9_flock and sent with p9_client_lock_dotl(); for a
+ * blocking request (SETLKW) the RPC is retried every P9_LOCK_TIMEOUT
+ * while the server keeps answering P9_LOCK_BLOCKED.
+ *
+ * If the server side fails, the local lock is reverted and the server
+ * side error is what the caller gets back.
+ *
+ * Returns 0 on success, -EAGAIN if the lock is held by someone else,
+ * -ENOLCK on a server error/grace period/unknown status, or the negative
+ * value returned by the local lock or the RPC.
+ */
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+        struct p9_flock flock;
+        struct p9_fid *fid;
+        /* Never inspected uninitialised, even if the RPC fails early */
+        uint8_t status = P9_LOCK_ERROR;
+        int res = 0;
+        unsigned char fl_type;
+        fid = filp->private_data;
+        BUG_ON(fid == NULL);
+        if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+                BUG();
+        res = posix_lock_file_wait(filp, fl);
+        if (res < 0)
+                goto out;
+        /* convert posix lock to p9 tlock args */
+        memset(&flock, 0, sizeof(flock));
+        flock.type = fl->fl_type;
+        flock.start = fl->fl_start;
+        if (fl->fl_end == OFFSET_MAX)
+                flock.length = 0;
+        else
+                flock.length = fl->fl_end - fl->fl_start + 1;
+        flock.proc_id = fl->fl_pid;
+        flock.client_id = utsname()->nodename;
+        if (IS_SETLKW(cmd))
+                flock.flags = P9_LOCK_FLAGS_BLOCK;
+        /*
+         * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+         * for lock request, keep on trying
+         */
+        for (;;) {
+                res = p9_client_lock_dotl(fid, &flock, &status);
+                /* RPC failed: status is meaningless, keep the RPC error */
+                if (res < 0)
+                        goto out_unlock;
+                if (status != P9_LOCK_BLOCKED || !IS_SETLKW(cmd))
+                        break;
+                schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+        }
+        /* map 9p status to VFS status */
+        switch (status) {
+        case P9_LOCK_SUCCESS:
+                res = 0;
+                break;
+        case P9_LOCK_BLOCKED:
+                res = -EAGAIN;
+                break;
+        case P9_LOCK_ERROR:
+        case P9_LOCK_GRACE:
+                res = -ENOLCK;
+                break;
+        default:
+                /* The status byte comes from the server; don't crash on it */
+                P9_DPRINTK(P9_DEBUG_ERROR,
+                                "unknown lock status code: %d\n", status);
+                res = -ENOLCK;
+                break;
+        }
+out_unlock:
+        /*
+         * incase server returned error for lock request, revert
+         * it locally
+         */
+        if (res < 0 && fl->fl_type != F_UNLCK) {
+                fl_type = fl->fl_type;
+                fl->fl_type = F_UNLCK;
+                /* Even if this fails we want to return the remote error */
+                posix_lock_file_wait(filp, fl);
+                fl->fl_type = fl_type;
+        }
+out:
+        return res;
+}
+/*
+ * Test for a conflicting lock, first locally and then on the server.
+ *
+ * posix_test_lock() updates @fl in place; if it reports a local conflict
+ * (type is no longer F_UNLCK) that answer is returned as is. Otherwise
+ * the request is converted to a p9_getlock and sent with
+ * p9_client_getlock_dotl(), and @fl is filled in from the reply.
+ *
+ * Returns 0 on success or the negative value returned by the RPC.
+ */
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+        struct p9_getlock glock;
+        struct p9_fid *fid = filp->private_data;
+        int res;
+        BUG_ON(fid == NULL);
+        posix_test_lock(filp, fl);
+        /*
+         * if we have a conflicting lock locally, no need to validate
+         * with server
+         */
+        if (fl->fl_type != F_UNLCK)
+                return 0;
+        /* convert posix lock to p9 tgetlock args */
+        memset(&glock, 0, sizeof(glock));
+        glock.type = fl->fl_type;
+        glock.start = fl->fl_start;
+        /* a length of 0 is used for a lock reaching OFFSET_MAX */
+        glock.length = (fl->fl_end == OFFSET_MAX) ?
+                        0 : fl->fl_end - fl->fl_start + 1;
+        glock.proc_id = fl->fl_pid;
+        glock.client_id = utsname()->nodename;
+        res = p9_client_getlock_dotl(fid, &glock);
+        if (res < 0)
+                return res;
+        if (glock.type == F_UNLCK) {
+                fl->fl_type = F_UNLCK;
+                return res;
+        }
+        /* the server reported a conflicting lock: copy it back */
+        fl->fl_type = glock.type;
+        fl->fl_start = glock.start;
+        fl->fl_end = (glock.length == 0) ?
+                        OFFSET_MAX : glock.start + glock.length - 1;
+        fl->fl_pid = glock.proc_id;
+        return res;
+}
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ * Set/unlock commands go through v9fs_file_do_lock(), get commands through
+ * v9fs_file_getlock(). Before a lock is taken the inode's dirty pages are
+ * written back and its cached pages invalidated. Returns -ENOLCK for
+ * mandatory locks and -EINVAL for any other command.
+ */
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+                                cmd, fl, filp->f_path.dentry->d_name.name);
+        /* No mandatory locks */
+        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+                return -ENOLCK;
+        if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
+                if (fl->fl_type != F_UNLCK) {
+                        filemap_write_and_wait(inode->i_mapping);
+                        invalidate_mapping_pages(&inode->i_data, 0, -1);
+                }
+                return v9fs_file_do_lock(filp, cmd, fl);
+        }
+        if (IS_GETLK(cmd))
+                return v9fs_file_getlock(filp, fl);
+        return -EINVAL;
+}
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ * The flock request is rewritten as a whole-file POSIX lock owned by @filp
+ * and handed to v9fs_file_do_lock(). Before a lock is taken the inode's
+ * dirty pages are written back and its cached pages invalidated.
+ *
+ * Returns -ENOLCK for mandatory locks or when @fl is not a flock request,
+ * -EINVAL when @cmd is not a set-lock command, otherwise the result of
+ * v9fs_file_do_lock().
+ */
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+        struct file_lock *fl)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        int ret = -ENOLCK;
+        P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+                                cmd, fl, filp->f_path.dentry->d_name.name);
+        /* No mandatory locks */
+        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+                goto out_err;
+        if (!(fl->fl_flags & FL_FLOCK))
+                goto out_err;
+        if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+                filemap_write_and_wait(inode->i_mapping);
+                invalidate_mapping_pages(&inode->i_data, 0, -1);
+        }
+        /* Convert flock to posix lock */
+        fl->fl_owner = (fl_owner_t)filp;
+        fl->fl_start = 0;
+        fl->fl_end = OFFSET_MAX;
+        fl->fl_flags |= FL_POSIX;
+        /* FL_FLOCK is known to be set here (checked above): clear it */
+        fl->fl_flags &= ~FL_FLOCK;
+        if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+                ret = v9fs_file_do_lock(filp, cmd, fl);
+        else
+                ret = -EINVAL;
+out_err:
+        return ret;
+}
 /**
 * v9fs_file_readn - read from a file
 * @filp: file pointer to read
@@ -219,7 +423,9 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
                size_t count, loff_t * offset)
 {
-        int n, rsize, total = 0;
+        ssize_t retval;
+        size_t total = 0;
+        int n;
        struct p9_fid *fid;
        struct p9_client *clnt;
        struct inode *inode = filp->f_path.dentry->d_inode;
@@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
        fid = filp->private_data;
        clnt = fid->clnt;
-        rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
+        retval = generic_write_checks(filp, &origin, &count, 0);
+        if (retval)
+                goto out;
-        do {
+        retval = -EINVAL;
-                if (count < rsize)
+        if ((ssize_t) count < 0)
-                        rsize = count;
+                goto out;
+        retval = 0;
+        if (!count)
+                goto out;
-                n = p9_client_write(fid, NULL, data+total, origin+total,
+        do {
-                                                                        rsize);
+                n = p9_client_write(fid, NULL, data+total, origin+total, count);
                if (n <= 0)
                        break;
                count -= n;
@@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
        }
        if (n < 0)
-                return n;
+                retval = n;
+        else
-        return total;
+                retval = total;
+out:
+        return retval;
 }
 static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
        return retval;
 }
+/*
+ * fsync for 9P2000.L files and directories: forwards the request for the
+ * fid stored in @filp to p9_client_fsync() and returns its result.
+ */
+int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+{
+        struct p9_fid *fid;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
+                        filp, datasync);
+        fid = filp->private_data;
+        return p9_client_fsync(fid, datasync);
+}
 static const struct file_operations v9fs_cached_file_operations = {
        .llseek = generic_file_llseek,
        .read = do_sync_read,
@@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
        .fsync = v9fs_file_fsync,
 };
+/*
+ * File operations table for the cached dotl case (presumably selected when
+ * a cache mode is active on a dotl mount -- confirm against the code that
+ * installs it).  Reads go through the generic sync/aio helpers, while
+ * locking and fsync use the dotl-specific handlers.
+ */
+static const struct file_operations v9fs_cached_file_operations_dotl = {
+        /* lifetime */
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+        /* data path */
+        .llseek = generic_file_llseek,
+        .read = do_sync_read,
+        .aio_read = generic_file_aio_read,
+        .write = v9fs_file_write,
+        .mmap = generic_file_readonly_mmap,
+        .fsync = v9fs_file_fsync_dotl,
+        /* locking */
+        .lock = v9fs_file_lock_dotl,
+        .flock = v9fs_file_flock_dotl,
+};
 const struct file_operations v9fs_file_operations = {
        .llseek = generic_file_llseek,
        .read = v9fs_file_read,
@@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
        .write = v9fs_file_write,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
-        .lock = v9fs_file_lock,
+        .lock = v9fs_file_lock_dotl,
+        .flock = v9fs_file_flock_dotl,
        .mmap = generic_file_readonly_mmap,
-        .fsync = v9fs_file_fsync,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..34bf71b56542 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -44,6 +45,7 @@
 #include "fid.h"
 #include "cache.h"
 #include "xattr.h"
+#include "acl.h"
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -53,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
 static const struct inode_operations v9fs_symlink_inode_operations_dotl;
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                    dev_t rdev);
 /**
 * unixmode2p9mode - convert unix mode bits to plan 9
 * @v9ses: v9fs session information
@@ -500,6 +506,11 @@ v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
        v9fs_vcookie_set_qid(ret, &st->qid);
        v9fs_cache_inode_get_cookie(ret);
 #endif
+        err = v9fs_get_acl(ret, fid);
+        if (err) {
+                iput(ret);
+                goto error;
+        }
        kfree(st);
        return ret;
 error:
@@ -553,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
        return retval;
 }
-static int
-v9fs_open_created(struct inode *inode, struct file *file)
-{
-        return 0;
-}
 /**
 * v9fs_create - Create a file
 * @v9ses: session information
@@ -655,29 +659,37 @@ error:
 */
 static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                struct nameidata *nd)
 {
        int err = 0;
        char *name = NULL;
        gid_t gid;
        int flags;
+        mode_t mode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL;
        struct p9_fid *dfid, *ofid;
        struct file *filp;
        struct p9_qid qid;
        struct inode *inode;
+        struct posix_acl *pacl = NULL, *dacl = NULL;
        v9ses = v9fs_inode2v9ses(dir);
        if (nd && nd->flags & LOOKUP_OPEN)
                flags = nd->intent.open.flags - 1;
-        else
+        else {
-                flags = O_RDWR;
+                /*
+                 * create call without LOOKUP_OPEN is due
+                 * to mknod of regular files. So use mknod
+                 * operation.
+                 */
+                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+        }
        name = (char *) dentry->d_name.name;
        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-                        "mode:0x%x\n", name, flags, mode);
+                        "mode:0x%x\n", name, flags, omode);
        dfid = v9fs_fid_lookup(dentry->d_parent);
        if (IS_ERR(dfid)) {
@@ -695,6 +707,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
        }
        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in creat %d\n", err);
+                goto error;
+        }
        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
        if (err < 0) {
                P9_DPRINTK(P9_DEBUG_VFS,
@@ -702,46 +723,52 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
                                err);
                goto error;
        }
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
+            (nd && nd->flags & LOOKUP_OPEN)) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
-        /* No need to populate the inode if we are not opening the file AND
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-         * not in cached mode.
+                if (IS_ERR(inode)) {
-         */
+                        err = PTR_ERR(inode);
-        if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                /* Not in cached mode. No need to populate inode with stat */
+                                err);
-                dentry->d_op = &v9fs_dentry_operations;
+                        goto error;
-                p9_client_clunk(ofid);
+                }
-                d_instantiate(dentry, NULL);
-                return 0;
-        }
-        /* Now walk from the parent so we can get an unopened fid. */
-        fid = p9_client_walk(dfid, 1, &name, 1);
-        if (IS_ERR(fid)) {
-                err = PTR_ERR(fid);
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-                fid = NULL;
-                goto error;
-        }
-        /* instantiate inode and assign the unopened fid to dentry */
-        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-        if (IS_ERR(inode)) {
-                err = PTR_ERR(inode);
-                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
-                goto error;
-        }
-        if (v9ses->cache)
                dentry->d_op = &v9fs_cached_dentry_operations;
-        else
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                /* The fid would get clunked via a dput */
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
                dentry->d_op = &v9fs_dentry_operations;
-        d_instantiate(dentry, inode);
+                d_instantiate(dentry, inode);
-        err = v9fs_fid_add(dentry, fid);
+        }
-        if (err < 0)
+        /* Now set the ACL based on the default value */
-                goto error;
+        v9fs_set_create_acl(dentry, dacl, pacl);
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        p9_client_clunk(ofid);
                        return PTR_ERR(filp);
@@ -800,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        err = PTR_ERR(filp);
                        goto error;
@@ -859,23 +886,28 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 *
 */
-static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-                                        int mode)
+                               struct dentry *dentry, int omode)
 {
        int err;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL, *dfid = NULL;
        gid_t gid;
        char *name;
+        mode_t mode;
        struct inode *inode;
        struct p9_qid qid;
        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
        err = 0;
        v9ses = v9fs_inode2v9ses(dir);
-        mode |= S_IFDIR;
+        omode |= S_IFDIR;
+        if (dir->i_mode & S_ISGID)
+                omode |= S_ISGID;
        dir_dentry = v9fs_dentry_from_dir_inode(dir);
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
@@ -886,11 +918,14 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
        }
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
+        mode = omode;
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mkdir %d\n", err);
                goto error;
        }
        name = (char *) dentry->d_name.name;
        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
        if (err < 0)
@@ -920,7 +955,23 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
                if (err < 0)
                        goto error;
                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_dentry_operations;
+                d_instantiate(dentry, inode);
        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
 error:
        if (fid)
                p9_client_clunk(fid);
@@ -979,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        result = v9fs_fid_add(dentry, fid);
        if (result < 0)
-                goto error;
+                goto error_iput;
 inst_out:
        if (v9ses->cache)
@@ -990,6 +1041,8 @@ inst_out:
        d_add(dentry, inode);
        return NULL;
+error_iput:
+        iput(inode);
 error:
        p9_client_clunk(fid);
@@ -1237,7 +1290,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 *
 */
-static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 {
        int retval;
        struct v9fs_session_info *v9ses;
@@ -1279,6 +1332,12 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
        setattr_copy(dentry->d_inode, iattr);
        mark_inode_dirty(dentry->d_inode);
+        if (iattr->ia_valid & ATTR_MODE) {
+                /* We also want to update ACL when we update mode bits */
+                retval = v9fs_acl_chmod(dentry);
+                if (retval < 0)
+                        return retval;
+        }
        return 0;
 }
@@ -1473,7 +1532,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
        if (IS_ERR(fid))
                return PTR_ERR(fid);
-        if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
+        if (!v9fs_proto_dotu(v9ses))
                return -EBADF;
        st = p9_client_stat(fid);
@@ -1616,11 +1675,6 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
-                goto error;
-        }
        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
@@ -1789,9 +1843,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                kfree(st);
        } else {
                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just i_count++
+                 * This dentry will be released immediately. So, just hold the
+                 * inode
                 */
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        dentry->d_op = old_dentry->d_op;
@@ -1854,21 +1909,23 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 *
 */
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
                dev_t rdev)
 {
        int err;
        char *name;
+        mode_t mode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL, *dfid = NULL;
        struct inode *inode;
        gid_t gid;
        struct p9_qid qid;
        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
        P9_DPRINTK(P9_DEBUG_VFS,
                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-                dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
        if (!new_valid_dev(rdev))
                return -EINVAL;
@@ -1884,11 +1941,14 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
        }
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
+        mode = omode;
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mknod %d\n", err);
                goto error;
        }
        name = (char *) dentry->d_name.name;
        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
@@ -1932,13 +1992,68 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
                dentry->d_op = &v9fs_dentry_operations;
                d_instantiate(dentry, inode);
        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
 error:
        if (fid)
                p9_client_clunk(fid);
        return err;
 }
+/**
+ * v9fs_vfs_readlink_dotl - read a symlink's target into a buffer
+ * @dentry: dentry for symlink
+ * @buffer: buffer the link target is copied into
+ * @buflen: size of @buffer
+ *
+ * Returns the number of bytes of the target now in @buffer (at most
+ * @buflen, in which case @buffer is NOT NUL terminated), or a negative
+ * errno from the fid lookup or the readlink request.
+ *
+ * NOTE(review): this is also installed as .readlink for dotl symlinks;
+ * confirm that @buffer is a kernel pointer on that path, since it is
+ * written with plain strncpy().
+ */
+static int
+v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
+{
+        int retval;
+        struct p9_fid *fid;
+        char *target = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        retval = p9_client_readlink(fid, &target);
+        if (retval < 0)
+                return retval;
+        /* may leave buffer unterminated; the follow_link caller fixes that */
+        strncpy(buffer, target, buflen);
+        /* the string handed back by p9_client_readlink() is ours to free */
+        kfree(target);
+        P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, buffer);
+        return strnlen(buffer, buflen);
+}
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ * Reads the link target into a buffer obtained from __getname() and hands
+ * that buffer (or an ERR_PTR on failure) to nd_set_link().  Always
+ * returns NULL.
+ */
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+        int len = 0;
+        char *link = __getname();
+        P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+        if (!link)
+                link = ERR_PTR(-ENOMEM);
+        else {
+                len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
+                if (len < 0) {
+                        __putname(link);
+                        link = ERR_PTR(len);
+                } else
+                        /* readlink may fill the buffer without a NUL */
+                        link[min(len, PATH_MAX-1)] = 0;
+        }
+        nd_set_link(nd, link);
+        return NULL;
+}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -1969,7 +2084,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotl = {
        .getxattr = generic_getxattr,
        .removexattr = generic_removexattr,
        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
 };
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1996,6 +2111,7 @@ static const struct inode_operations v9fs_file_inode_operations_dotl = {
        .getxattr = generic_getxattr,
        .removexattr = generic_removexattr,
        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
 };
 static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -2007,8 +2123,8 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 };
 static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-        .readlink = generic_readlink,
+        .readlink = v9fs_vfs_readlink_dotl,
-        .follow_link = v9fs_vfs_follow_link,
+        .follow_link = v9fs_vfs_follow_link_dotl,
        .put_link = v9fs_vfs_put_link,
        .getattr = v9fs_vfs_getattr_dotl,
        .setattr = v9fs_vfs_setattr_dotl,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..c55c614500ad 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/magic.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -46,6 +47,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "xattr.h"
+#include "acl.h"
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
@@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 * v9fs_fill_super - populate superblock with info
 * @sb: superblock
 * @v9ses: session information
- * @flags: flags propagated from v9fs_get_sb()
+ * @flags: flags propagated from v9fs_mount()
 *
 */
@@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
            MS_NOATIME;
+#ifdef CONFIG_9P_FS_POSIX_ACL
+        if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+                sb->s_flags |= MS_POSIXACL;
+#endif
        save_mount_options(sb, data);
 }
 /**
- * v9fs_get_sb - mount a superblock
+ * v9fs_mount - mount a superblock
 * @fs_type: file system type
 * @flags: mount flags
 * @dev_name: device name that was mounted
 * @data: mount options
- * @mnt: mountpoint record to be instantiated
 *
 */
-static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data,
+                       const char *dev_name, void *data)
-                       struct vfsmount *mnt)
 {
        struct super_block *sb = NULL;
        struct inode *inode = NULL;
@@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
        if (!v9ses)
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        fid = v9fs_session_init(v9ses, dev_name, data);
        if (IS_ERR(fid)) {
@@ -149,7 +154,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                goto release_sb;
        }
        sb->s_root = root;
        if (v9fs_proto_dotl(v9ses)) {
                struct p9_stat_dotl *st = NULL;
                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,19 +178,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                p9stat_free(st);
                kfree(st);
        }
+        retval = v9fs_get_acl(inode, fid);
+        if (retval)
+                goto release_sb;
        v9fs_fid_add(root, fid);
        P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 clunk_fid:
        p9_client_clunk(fid);
 close_session:
        v9fs_session_close(v9ses);
        kfree(v9ses);
-        return retval;
+        return ERR_PTR(retval);
 release_sb:
        /*
         * we will do the session_close and root dentry release
@@ -196,7 +202,7 @@ release_sb:
         */
        p9_client_clunk(fid);
        deactivate_locked_super(sb);
-        return retval;
+        return ERR_PTR(retval);
 }
 /**
@@ -249,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
        if (v9fs_proto_dotl(v9ses)) {
                res = p9_client_statfs(fid, &rs);
                if (res == 0) {
-                        buf->f_type = rs.type;
+                        buf->f_type = V9FS_MAGIC;
                        buf->f_bsize = rs.bsize;
                        buf->f_blocks = rs.blocks;
                        buf->f_bfree = rs.bfree;
@@ -292,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 struct file_system_type v9fs_fs_type = {
        .name = "9p",
-        .get_sb = v9fs_get_sb,
+        .mount = v9fs_mount,
        .kill_sb = v9fs_kill_super,
        .owner = THIS_MODULE,
        .fs_flags = FS_RENAME_DOES_D_MOVE,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..43ec7df84336 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
 #include "fid.h"
 #include "xattr.h"
-/*
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
- * v9fs_xattr_get()
+                           void *buffer, size_t buffer_size)
- *
- * Copy an extended attribute into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t buffer_size)
 {
        ssize_t retval;
        int msize, read_count;
        u64 offset = 0, attr_size;
-        struct p9_fid *fid, *attr_fid;
+        struct p9_fid *attr_fid;
-        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
-                __func__, name, buffer_size);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
        attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
        if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
 }
+/*
+ * v9fs_xattr_get()
+ *
+ * Dentry-based front end for v9fs_fid_xattr_get(): looks up the fid
+ * attached to @dentry and fetches the extended attribute @name through it.
+ * Pass a NULL buffer to learn how large a buffer is required.
+ *
+ * Returns a negative error number on failure (including a failed fid
+ * lookup), or the number of bytes used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+                       void *buffer, size_t buffer_size)
+{
+        struct p9_fid *xfid;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+                __func__, name, buffer_size);
+        xfid = v9fs_fid_lookup(dentry);
+        if (!IS_ERR(xfid))
+                return v9fs_fid_xattr_get(xfid, name, buffer, buffer_size);
+        return PTR_ERR(xfid);
+}
 /*
 * v9fs_xattr_set()
 *
@@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 const struct xattr_handler *v9fs_xattr_handlers[] = {
        &v9fs_xattr_user_handler,
+#ifdef CONFIG_9P_FS_POSIX_ACL
+        &v9fs_xattr_acl_access_handler,
+        &v9fs_xattr_acl_default_handler,
+#endif
        NULL
 };
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,16 @@
 #define FS_9P_XATTR_H
 #include <linux/xattr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
+extern const struct xattr_handler v9fs_xattr_acl_access_handler;
+extern const struct xattr_handler v9fs_xattr_acl_default_handler;
+extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
+                                  void *, size_t);
 extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
                              void *, size_t);
 extern int v9fs_xattr_set(struct dentry *, const char *,
diff --git a/fs/Kconfig b/fs/Kconfig
index 65781de44fc0..771f457402d4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,10 +47,12 @@ source "fs/nilfs2/Kconfig"
 endif # BLOCK
+config EXPORTFS
+        tristate
 config FILE_LOCKING
        bool "Enable POSIX file locking API" if EMBEDDED
        default y
-        select BKL # while lockd still uses it.
        help
          This option enables standard file locking support, required
          for filesystems like NFS and for the flock() system
@@ -60,7 +62,6 @@ source "fs/notify/Kconfig"
 source "fs/quota/Kconfig"
-source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
@@ -222,9 +223,6 @@ config LOCKD_V4
        depends on FILE_LOCKING
        default y
-config EXPORTFS
-        tristate
 config NFS_ACL_SUPPORT
        tristate
        select FS_POSIX_ACL
@@ -235,7 +233,6 @@ config NFS_COMMON
        default y
 source "net/sunrpc/Kconfig"
-source "fs/smbfs/Kconfig"
 source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
        bool "Write ELF core dumps with partial segments"
-        default n
+        default y
        depends on BINFMT_ELF && ELF_CORE
        help
          ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
          inherited.  See Documentation/filesystems/proc.txt for details.
          This config option changes the default setting of coredump_filter
-          seen at boot time.  If unsure, say N.
+          seen at boot time.  If unsure, say Y.
 config BINFMT_FLAT
        bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..a7f7cef0c0c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)            += compat.o compat_ioctl.o
+obj-$(CONFIG_NFSD_DEPRECATED)   += nfsctl.o
-nfsd-$(CONFIG_NFSD)             := nfsctl.o
-obj-y                           += $(nfsd-y) $(nfsd-m)
 obj-$(CONFIG_BINFMT_AOUT)       += binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)       += binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)       += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)             += lockd/
 obj-$(CONFIG_NLS)               += nls/
 obj-$(CONFIG_SYSV_FS)           += sysv/
-obj-$(CONFIG_SMB_FS)            += smbfs/
 obj-$(CONFIG_CIFS)              += cifs/
 obj-$(CONFIG_NCP_FS)            += ncpfs/
 obj-$(CONFIG_HPFS_FS)           += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)           += affs/
 obj-$(CONFIG_ROMFS_FS)          += romfs/
 obj-$(CONFIG_QNX4FS_FS)         += qnx4/
-obj-$(CONFIG_AUTOFS_FS)         += autofs/
 obj-$(CONFIG_AUTOFS4_FS)        += autofs4/
 obj-$(CONFIG_ADFS_FS)           += adfs/
 obj-$(CONFIG_FUSE_FS)           += fuse/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index d9803f73236f..959dbff2d42d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -490,17 +490,16 @@ error:
        return -EINVAL;
 }
-static int adfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *adfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
-                           mnt);
 }
 static struct file_system_type adfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "adfs",
-        .get_sb         = adfs_get_sb,
+        .mount          = adfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
                if (AFFS_SB(sb)->s_flags & SF_OFS) {
                        struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
                        u32 tmp;
-                        if (IS_ERR(ext_bh)) {
+                        if (IS_ERR(bh)) {
                                affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-                                             ext, PTR_ERR(ext_bh));
+                                             ext, PTR_ERR(bh));
                                return;
                        }
                        tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
                affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
                mark_buffer_dirty_inode(inode_bh, inode);
                inode->i_nlink = 2;
-                atomic_inc(&inode->i_count);
+                ihold(inode);
        }
        affs_fix_checksum(sb, bh);
        mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index fa4fbe1e238a..0cf7f4384cbd 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -573,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int affs_get_sb(struct file_system_type *fs_type,
+static struct dentry *affs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
-                           mnt);
 }
 static struct file_system_type affs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "affs",
-        .get_sb         = affs_get_sb,
+        .mount          = affs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
        if (ret < 0)
                goto link_error;
-        atomic_inc(&vnode->vfs_inode.i_count);
+        ihold(&vnode->vfs_inode);
        d_instantiate(dentry, &vnode->vfs_inode);
        key_put(key);
        _leave(" = 0");
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eacf76d98ae0..27201cffece4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -29,9 +29,8 @@
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
+static struct dentry *afs_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name,
+                      int flags, const char *dev_name, void *data);
-                      void *data, struct vfsmount *mnt);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "afs",
-        .get_sb         = afs_get_sb,
+        .mount          = afs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = 0,
 };
@@ -359,11 +358,8 @@ error:
 /*
 * get an AFS superblock
 */
-static int afs_get_sb(struct file_system_type *fs_type,
+static struct dentry *afs_mount(struct file_system_type *fs_type,
-                      int flags,
+                      int flags, const char *dev_name, void *options)
-                      const char *dev_name,
-                      void *options,
-                      struct vfsmount *mnt)
 {
        struct afs_mount_params params;
        struct super_block *sb;
@@ -427,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
                ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
        }
-        simple_set_mnt(mnt, sb);
        afs_put_volume(params.volume);
        afs_put_cell(params.cell);
        kfree(new_opts);
        _leave(" = 0 [%p]", sb);
-        return 0;
+        return dget(sb->s_root);
 error:
        afs_put_volume(params.volume);
@@ -440,7 +435,7 @@ error:
        key_put(params.key);
        kfree(new_opts);
        _leave(" = %d", ret);
-        return ret;
+        return ERR_PTR(ret);
 }
 /*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
 */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = page->mapping->backing_dev_info;
        struct afs_writeback *wb;
        int ret;
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
        }
        wbc->nr_to_write -= ret;
-        if (wbc->nonblocking && bdi_write_congested(bdi))
-                wbc->encountered_congestion = 1;
        _leave(" = 0");
        return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
                                 struct writeback_control *wbc,
                                 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct afs_writeback *wb;
        struct page *page;
        int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
                wbc->nr_to_write -= ret;
-                if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                        wbc->encountered_congestion = 1;
-                        break;
-                }
                cond_resched();
        } while (index < end && wbc->nr_to_write > 0);
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
                   struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        pgoff_t start, end, next;
        int ret;
        _enter("");
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                _leave(" = 0 [congest]");
-                return 0;
-        }
        if (wbc->range_cyclic) {
                start = mapping->writeback_index;
                end = -1;
                ret = afs_writepages_region(mapping, wbc, start, end, &next);
-                if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
+                if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
-                    !(wbc->nonblocking && wbc->encountered_congestion))
                        ret = afs_writepages_region(mapping, wbc, 0, start,
                                                    &next);
                mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
        }
        abe = mempool_alloc(abe_pool, GFP_KERNEL);
-        BUG_ON(!igrab(mapping->host));
+        /*
+         * we should be using igrab here, but
+         * we don't want to hammer on the global
+         * inode spinlock just to take an extra
+         * reference on a file that we must already
+         * have a reference to.
+         *
+         * When we're called, we always have a reference
+         * on the file, so we must always have a reference
+         * on the inode, so ihold() is safe here.
+         */
+        ihold(mapping->host);
        abe->mapping = mapping;
        hlist_add_head(&abe->list, &batch_hash[bucket]);
        return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..57ce55b2564c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
-                               const char *dev_name, void *data,
+                                int flags, const char *dev_name, void *data)
-                               struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+        return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
-                             mnt);
 }
 /*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 static struct file_system_type anon_inode_fs_type = {
        .name           = "anon_inodefs",
-        .get_sb         = anon_inodefs_get_sb,
+        .mount          = anon_inodefs_mount,
        .kill_sb        = kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
@@ -111,10 +109,9 @@ struct file *anon_inode_getfile(const char *name,
        path.mnt = mntget(anon_inode_mnt);
        /*
         * We know the anon_inode inode count is always greater than zero,
-         * so we can avoid doing an igrab() and we can use an open-coded
+         * so ihold() is safe.
-         * atomic_inc().
         */
-        atomic_inc(&anon_inode_inode->i_count);
+        ihold(anon_inode_inode);
        path.dentry->d_op = &anon_inodefs_dentry_operations;
        d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +191,7 @@ static struct inode *anon_inode_mkinode(void)
        if (!inode)
                return ERR_PTR(-ENOMEM);
+        inode->i_ino = get_next_ino();
        inode->i_fop = &anon_inode_fops;
        inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 480e210c83ab..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,22 +0,0 @@
-config AUTOFS_FS
-        tristate "Kernel automounter support"
-        depends on BKL # unfixable, just use autofs4
-        help
-          The automounter is a tool to automatically mount remote file systems
-          on demand. This implementation is partially kernel-based to reduce
-          overhead in the already-mounted case; this is unlike the BSD
-          automounter (amd), which is a pure user space daemon.
-          To use the automounter you need the user-space tools from the autofs
-          package; you can find the location in <file:Documentation/Changes>.
-          You also want to answer Y to "NFS file system support", below.
-          If you want to use the newer version of the automounter with more
-          features, say N here and say Y to "Kernel automounter v4 support",
-          below.
-          To compile this support as a module, choose M here: the module will be
-          called autofs.
-          If you are not a part of a fairly large, distributed network, you
-          probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux autofs-filesystem routines.
-#
-obj-$(CONFIG_AUTOFS_FS) += autofs.o
-autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/fs/autofs/autofs_i.h
- *
- *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-/* Internal header file for autofs */
-#include <linux/auto_fs.h>
-/* This is the range of ioctl() numbers we claim as ours */
-#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
-#define AUTOFS_IOC_COUNT     32
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-#include <asm/current.h>
-#include <asm/uaccess.h>
-#ifdef DEBUG
-#define DPRINTK(D) (printk D)
-#else
-#define DPRINTK(D) ((void)0)
-#endif
-/*
- * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
- * kernel will keep the negative response cached for up to the time given
- * here, although the time can be shorter if the kernel throws the dcache
- * entry away.  This probably should be settable from user space.
- */
-#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */
-/* Structures associated with the root directory hash table */
-#define AUTOFS_HASH_SIZE 67
-struct autofs_dir_ent {
-        int hash;
-        char *name;
-        int len;
-        ino_t ino;
-        struct dentry *dentry;
-        /* Linked list of entries */
-        struct autofs_dir_ent *next;
-        struct autofs_dir_ent **back;
-        /* The following entries are for the expiry system */
-        unsigned long last_usage;
-        struct list_head exp;
-};
-struct autofs_dirhash {
-        struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
-        struct list_head expiry_head;
-};
-struct autofs_wait_queue {
-        wait_queue_head_t queue;
-        struct autofs_wait_queue *next;
-        autofs_wqt_t wait_queue_token;
-        /* We use the following to see what we are waiting for */
-        int hash;
-        int len;
-        char *name;
-        /* This is for status reporting upon return */
-        int status;
-        int wait_ctr;
-};
-struct autofs_symlink {
-        char *data;
-        int len;
-        time_t mtime;
-};
-#define AUTOFS_MAX_SYMLINKS 256
-#define AUTOFS_ROOT_INO      1
-#define AUTOFS_FIRST_SYMLINK 2
-#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
-#define AUTOFS_SYMLINK_BITMAP_LEN \
-        ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
-#define AUTOFS_SBI_MAGIC 0x6d4a556d
-struct autofs_sb_info {
-        u32 magic;
-        struct file *pipe;
-        struct pid *oz_pgrp;
-        int catatonic;
-        struct super_block *sb;
-        unsigned long exp_timeout;
-        ino_t next_dir_ino;
-        struct autofs_wait_queue *queues; /* Wait queue pointer */
-        struct autofs_dirhash dirhash; /* Root directory hash */
-        struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
-        unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
-};
-static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
-{
-        return (struct autofs_sb_info *)(sb->s_fs_info);
-}
-/* autofs_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-        return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
-}
-/* Hash operations */
-void autofs_initialize_hash(struct autofs_dirhash *);
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
-void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
-void autofs_hash_delete(struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
-void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_sb_info *);
-/* Expiration-handling functions */
-void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
-/* Operations structures */
-extern const struct inode_operations autofs_root_inode_operations;
-extern const struct inode_operations autofs_symlink_inode_operations;
-extern const struct file_operations autofs_root_operations;
-/* Initializing function */
-int autofs_fill_super(struct super_block *, void *, int);
-void autofs_kill_sb(struct super_block *sb);
-struct inode *autofs_iget(struct super_block *, unsigned long);
-/* Queue management functions */
-int autofs_wait(struct autofs_sb_info *,struct qstr *);
-int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
-void autofs_catatonic_mode(struct autofs_sb_info *);
-#ifdef DEBUG
-void autofs_say(const char *name, int len);
-#else
-#define autofs_say(n,l) ((void)0)
-#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/dirhash.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include "autofs_i.h"
-/* Functions for maintenance of expiry queue */
-static void autofs_init_usage(struct autofs_dirhash *dh,
-                              struct autofs_dir_ent *ent)
-{
-        list_add_tail(&ent->exp, &dh->expiry_head);
-        ent->last_usage = jiffies;
-}
-static void autofs_delete_usage(struct autofs_dir_ent *ent)
-{
-        list_del(&ent->exp);
-}
-void autofs_update_usage(struct autofs_dirhash *dh,
-                         struct autofs_dir_ent *ent)
-{
-        autofs_delete_usage(ent);   /* Unlink from current position */
-        autofs_init_usage(dh,ent);  /* Relink at queue tail */
-}
-struct autofs_dir_ent *autofs_expire(struct super_block *sb,
-                                     struct autofs_sb_info *sbi,
-                                     struct vfsmount *mnt)
-{
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned long timeout = sbi->exp_timeout;
-        while (1) {
-                struct path path;
-                int umount_ok;
-                if ( list_empty(&dh->expiry_head) || sbi->catatonic )
-                        return NULL;    /* No entries */
-                /* We keep the list sorted by last_usage and want old stuff */
-                ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
-                if (jiffies - ent->last_usage < timeout)
-                        break;
-                /* Move to end of list in case expiry isn't desirable */
-                autofs_update_usage(dh, ent);
-                /* Check to see that entry is expirable */
-                if ( ent->ino < AUTOFS_FIRST_DIR_INO )
-                        return ent; /* Symlinks are always expirable */
-                /* Get the dentry for the autofs subdirectory */
-                path.dentry = ent->dentry;
-                if (!path.dentry) {
-                        /* Should only happen in catatonic mode */
-                        printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
-                        autofs_delete_usage(ent);
-                        continue;
-                }
-                if (!path.dentry->d_inode) {
-                        dput(path.dentry);
-                        printk("autofs: negative dentry on expiry queue: %s\n",
-                               ent->name);
-                        autofs_delete_usage(ent);
-                        continue;
-                }
-                /* Make sure entry is mounted and unused; note that dentry will
-                   point to the mounted-on-top root. */
-                if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
-                    !d_mountpoint(path.dentry)) {
-                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-                        continue;
-                }
-                path.mnt = mnt;
-                path_get(&path);
-                if (!follow_down(&path)) {
-                        path_put(&path);
-                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-                        continue;
-                }
-                while (d_mountpoint(path.dentry) && follow_down(&path))
-                        ;
-                umount_ok = may_umount(path.mnt);
-                path_put(&path);
-                if (umount_ok) {
-                        DPRINTK(("autofs: signaling expire on %s\n", ent->name));
-                        return ent; /* Expirable! */
-                }
-                DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-        }
-        return NULL;            /* No expirable entries */
-}
-void autofs_initialize_hash(struct autofs_dirhash *dh) {
-        memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
-        INIT_LIST_HEAD(&dh->expiry_head);
-}
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
-{
-        struct autofs_dir_ent *dhn;
-        DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
-        autofs_say(name->name,name->len);
-        for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
-                if ( name->hash == dhn->hash &&
-                     name->len == dhn->len &&
-                     !memcmp(name->name, dhn->name, name->len) )
-                        break;
-        }
-        return dhn;
-}
-void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
-{
-        struct autofs_dir_ent **dhnp;
-        DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
-        autofs_say(ent->name,ent->len);
-        autofs_init_usage(dh,ent);
-        if (ent->dentry)
-                dget(ent->dentry);
-        dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
-        ent->next = *dhnp;
-        ent->back = dhnp;
-        *dhnp = ent;
-        if ( ent->next )
-                ent->next->back = &(ent->next);
-}
-void autofs_hash_delete(struct autofs_dir_ent *ent)
-{
-        *(ent->back) = ent->next;
-        if ( ent->next )
-                ent->next->back = ent->back;
-        autofs_delete_usage(ent);
-        if ( ent->dentry )
-                dput(ent->dentry);
-        kfree(ent->name);
-        kfree(ent);
-}
-/*
- * Used by readdir().  We must validate "ptr", so we can't simply make it
- * a pointer.  Values below 0xffff are reserved; calling with any value
- * <= 0x10000 will return the first entry found.
- *
- * "last" can be NULL or the value returned by the last search *if* we
- * want the next sequential entry.
- */
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
-                                        off_t *ptr, struct autofs_dir_ent *last)
-{
-        int bucket, ecount, i;
-        struct autofs_dir_ent *ent;
-        bucket = (*ptr >> 16) - 1;
-        ecount = *ptr & 0xffff;
-        if ( bucket < 0 ) {
-                bucket = ecount = 0;
-        } 
-        DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
-        ent = last ? last->next : NULL;
-        if ( ent ) {
-                ecount++;
-        } else {
-                while  ( bucket < AUTOFS_HASH_SIZE ) {
-                        ent = dh->h[bucket];
-                        for ( i = ecount ; ent && i ; i-- )
-                                ent = ent->next;
-                        
-                        if (ent) {
-                                ecount++; /* Point to *next* entry */
-                                break;
-                        }
-                        
-                        bucket++; ecount = 0;
-                }
-        }
-#ifdef DEBUG
-        if ( !ent )
-                printk("autofs_hash_enum: nothing found\n");
-        else {
-                printk("autofs_hash_enum: found hash %08x, name", ent->hash);
-                autofs_say(ent->name,ent->len);
-        }
-#endif
-        *ptr = ((bucket+1) << 16) + ecount;
-        return ent;
-}
-/* Iterate over all the ents, and remove all dentry pointers.  Used on
-   entering catatonic mode, in order to make the filesystem unmountable. */
-void autofs_hash_dputall(struct autofs_dirhash *dh)
-{
-        int i;
-        struct autofs_dir_ent *ent;
-        for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-                for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
-                        if ( ent->dentry ) {
-                                dput(ent->dentry);
-                                ent->dentry = NULL;
-                        }
-                }
-        }
-}
-/* Delete everything.  This is used on filesystem destruction, so we
-   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_sb_info *sbi)
-{
-        int i;
-        struct autofs_dir_ent *ent, *nent;
-        for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-                for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
-                        nent = ent->next;
-                        if ( ent->dentry )
-                                dput(ent->dentry);
-                        kfree(ent->name);
-                        kfree(ent);
-                }
-        }
-}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include "autofs_i.h"
-static int autofs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
-}
-static struct file_system_type autofs_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "autofs",
-        .get_sb         = autofs_get_sb,
-        .kill_sb        = autofs_kill_sb,
-};
-static int __init init_autofs_fs(void)
-{
-        return register_filesystem(&autofs_fs_type);
-}
-static void __exit exit_autofs_fs(void)
-{
-        unregister_filesystem(&autofs_fs_type);
-}
-module_init(init_autofs_fs);
-module_exit(exit_autofs_fs);
-#ifdef DEBUG
-void autofs_say(const char *name, int len)
-{
-        printk("(%d: ", len);
-        while ( len-- )
-                printk("%c", *name++);
-        printk(")\n");
-}
-#endif
-MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/parser.h>
-#include <linux/bitops.h>
-#include <linux/magic.h>
-#include "autofs_i.h"
-#include <linux/module.h>
-void autofs_kill_sb(struct super_block *sb)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(sb);
-        unsigned int n;
-        /*
-         * In the event of a failure in get_sb_nodev the superblock
-         * info is not present so nothing else has been setup, so
-         * just call kill_anon_super when we are called from
-         * deactivate_super.
-         */
-        if (!sbi)
-                goto out_kill_sb;
-        if (!sbi->catatonic)
-                autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-        put_pid(sbi->oz_pgrp);
-        autofs_hash_nuke(sbi);
-        for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
-                if (test_bit(n, sbi->symlink_bitmap))
-                        kfree(sbi->symlink[n].data);
-        }
-        kfree(sb->s_fs_info);
-out_kill_sb:
-        DPRINTK(("autofs: shutting down\n"));
-        kill_anon_super(sb);
-}
-static const struct super_operations autofs_sops = {
-        .statfs         = simple_statfs,
-        .show_options   = generic_show_options,
-};
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-static const match_table_t autofs_tokens = {
-        {Opt_fd, "fd=%u"},
-        {Opt_uid, "uid=%u"},
-        {Opt_gid, "gid=%u"},
-        {Opt_pgrp, "pgrp=%u"},
-        {Opt_minproto, "minproto=%u"},
-        {Opt_maxproto, "maxproto=%u"},
-        {Opt_err, NULL}
-};
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-                pid_t *pgrp, int *minproto, int *maxproto)
-{
-        char *p;
-        substring_t args[MAX_OPT_ARGS];
-        int option;
-        *uid = current_uid();
-        *gid = current_gid();
-        *pgrp = task_pgrp_nr(current);
-        *minproto = *maxproto = AUTOFS_PROTO_VERSION;
-        *pipefd = -1;
-        if (!options)
-                return 1;
-        while ((p = strsep(&options, ",")) != NULL) {
-                int token;
-                if (!*p)
-                        continue;
-                token = match_token(p, autofs_tokens, args);
-                switch (token) {
-                case Opt_fd:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *pipefd = option;
-                        break;
-                case Opt_uid:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *uid = option;
-                        break;
-                case Opt_gid:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *gid = option;
-                        break;
-                case Opt_pgrp:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *pgrp = option;
-                        break;
-                case Opt_minproto:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *minproto = option;
-                        break;
-                case Opt_maxproto:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *maxproto = option;
-                        break;
-                default:
-                        return 1;
-                }
-        }
-        return (*pipefd < 0);
-}
-int autofs_fill_super(struct super_block *s, void *data, int silent)
-{
-        struct inode * root_inode;
-        struct dentry * root;
-        struct file * pipe;
-        int pipefd;
-        struct autofs_sb_info *sbi;
-        int minproto, maxproto;
-        pid_t pgid;
-        save_mount_options(s, data);
-        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-        if (!sbi)
-                goto fail_unlock;
-        DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
-        s->s_fs_info = sbi;
-        sbi->magic = AUTOFS_SBI_MAGIC;
-        sbi->pipe = NULL;
-        sbi->catatonic = 1;
-        sbi->exp_timeout = 0;
-        autofs_initialize_hash(&sbi->dirhash);
-        sbi->queues = NULL;
-        memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
-        sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
-        s->s_blocksize = 1024;
-        s->s_blocksize_bits = 10;
-        s->s_magic = AUTOFS_SUPER_MAGIC;
-        s->s_op = &autofs_sops;
-        s->s_time_gran = 1;
-        sbi->sb = s;
-        root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
-        if (IS_ERR(root_inode))
-                goto fail_free;
-        root = d_alloc_root(root_inode);
-        pipe = NULL;
-        if (!root)
-                goto fail_iput;
-        /* Can this call block?  - WTF cares? s is locked. */
-        if (parse_options(data, &pipefd, &root_inode->i_uid,
-                                &root_inode->i_gid, &pgid, &minproto,
-                                &maxproto)) {
-                printk("autofs: called with bogus options\n");
-                goto fail_dput;
-        }
-        /* Couldn't this be tested earlier? */
-        if (minproto > AUTOFS_PROTO_VERSION ||
-             maxproto < AUTOFS_PROTO_VERSION) {
-                printk("autofs: kernel does not match daemon version\n");
-                goto fail_dput;
-        }
-        DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
-        sbi->oz_pgrp = find_get_pid(pgid);
-        if (!sbi->oz_pgrp) {
-                printk("autofs: could not find process group %d\n", pgid);
-                goto fail_dput;
-        }
-        pipe = fget(pipefd);
-        
-        if (!pipe) {
-                printk("autofs: could not open pipe file descriptor\n");
-                goto fail_put_pid;
-        }
-        if (!pipe->f_op || !pipe->f_op->write)
-                goto fail_fput;
-        sbi->pipe = pipe;
-        sbi->catatonic = 0;
-        /*
-         * Success! Install the root dentry now to indicate completion.
-         */
-        s->s_root = root;
-        return 0;
-fail_fput:
-        printk("autofs: pipe file descriptor does not contain proper ops\n");
-        fput(pipe);
-fail_put_pid:
-        put_pid(sbi->oz_pgrp);
-fail_dput:
-        dput(root);
-        goto fail_free;
-fail_iput:
-        printk("autofs: get root dentry failed\n");
-        iput(root_inode);
-fail_free:
-        kfree(sbi);
-        s->s_fs_info = NULL;
-fail_unlock:
-        return -EINVAL;
-}
-struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
-{
-        unsigned int n;
-        struct autofs_sb_info *sbi = autofs_sbi(sb);
-        struct inode *inode;
-        inode = iget_locked(sb, ino);
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
-        if (!(inode->i_state & I_NEW))
-                return inode;
-        /* Initialize to the default case (stub directory) */
-        inode->i_op = &simple_dir_inode_operations;
-        inode->i_fop = &simple_dir_operations;
-        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-        inode->i_nlink = 2;
-        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-        if (ino == AUTOFS_ROOT_INO) {
-                inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
-                inode->i_op = &autofs_root_inode_operations;
-                inode->i_fop = &autofs_root_operations;
-                goto done;
-        } 
-        
-        inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
-        inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
-        
-        if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
-                /* Symlink inode - should be in symlink list */
-                struct autofs_symlink *sl;
-                n = ino - AUTOFS_FIRST_SYMLINK;
-                if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
-                        printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
-                        goto done;
-                }
-                
-                inode->i_op = &autofs_symlink_inode_operations;
-                sl = &sbi->symlink[n];
-                inode->i_private = sl;
-                inode->i_mode = S_IFLNK | S_IRWXUGO;
-                inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
-                inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
-                inode->i_size = sl->len;
-                inode->i_nlink = 1;
-        }
-done:
-        unlock_new_inode(inode);
-        return inode;
-}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 0c4ca81aeaeb..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/param.h>
-#include <linux/time.h>
-#include <linux/compat.h>
-#include <linux/smp_lock.h>
-#include "autofs_i.h"
-static int autofs_root_readdir(struct file *,void *,filldir_t);
-static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
-static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
-static int autofs_root_unlink(struct inode *,struct dentry *);
-static int autofs_root_rmdir(struct inode *,struct dentry *);
-static int autofs_root_mkdir(struct inode *,struct dentry *,int);
-static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
-#endif
-const struct file_operations autofs_root_operations = {
-        .llseek         = generic_file_llseek,
-        .read           = generic_read_dir,
-        .readdir        = autofs_root_readdir,
-        .unlocked_ioctl = autofs_root_ioctl,
-#ifdef CONFIG_COMPAT
-        .compat_ioctl   = autofs_root_compat_ioctl,
-#endif
-};
-const struct inode_operations autofs_root_inode_operations = {
-        .lookup         = autofs_root_lookup,
-        .unlink         = autofs_root_unlink,
-        .symlink        = autofs_root_symlink,
-        .mkdir          = autofs_root_mkdir,
-        .rmdir          = autofs_root_rmdir,
-};
-static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-        struct autofs_dir_ent *ent = NULL;
-        struct autofs_dirhash *dirhash;
-        struct autofs_sb_info *sbi;
-        struct inode * inode = filp->f_path.dentry->d_inode;
-        off_t onr, nr;
-        lock_kernel();
-        sbi = autofs_sbi(inode->i_sb);
-        dirhash = &sbi->dirhash;
-        nr = filp->f_pos;
-        switch(nr)
-        {
-        case 0:
-                if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = ++nr;
-                /* fall through */
-        case 1:
-                if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = ++nr;
-                /* fall through */
-        default:
-                while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
-                        if (!ent->dentry || d_mountpoint(ent->dentry)) {
-                                if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
-                                        goto out;
-                                filp->f_pos = nr;
-                        }
-                }
-                break;
-        }
-out:
-        unlock_kernel();
-        return 0;
-}
-static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
-{
-        struct inode * inode;
-        struct autofs_dir_ent *ent;
-        int status = 0;
-        if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
-                do {
-                        if (status && dentry->d_inode) {
-                                if (status != -ENOENT)
-                                        printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
-                                return 0; /* Try to get the kernel to invalidate this dentry */
-                        }
-                        /* Turn this into a real negative dentry? */
-                        if (status == -ENOENT) {
-                                dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
-                                dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-                                return 1;
-                        } else if (status) {
-                                /* Return a negative dentry, but leave it "pending" */
-                                return 1;
-                        }
-                        status = autofs_wait(sbi, &dentry->d_name);
-                } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
-        }
-        /* Abuse this field as a pointer to the directory entry, used to
-           find the expire list pointers */
-        dentry->d_time = (unsigned long) ent;
-        
-        if (!dentry->d_inode) {
-                inode = autofs_iget(sb, ent->ino);
-                if (IS_ERR(inode)) {
-                        /* Failed, but leave pending for next time */
-                        return 1;
-                }
-                dentry->d_inode = inode;
-        }
-        /* If this is a directory that isn't a mount point, bitch at the
-           daemon and fix it in user space */
-        if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-                return !autofs_wait(sbi, &dentry->d_name);
-        }
-        /* We don't update the usages for the autofs daemon itself, this
-           is necessary for recursive autofs mounts */
-        if (!autofs_oz_mode(sbi)) {
-                autofs_update_usage(&sbi->dirhash,ent);
-        }
-        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-        return 1;
-}
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
-{
-        struct inode * dir;
-        struct autofs_sb_info *sbi;
-        struct autofs_dir_ent *ent;
-        int res;
-        lock_kernel();
-        dir = dentry->d_parent->d_inode;
-        sbi = autofs_sbi(dir->i_sb);
-        /* Pending dentry */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-                if (autofs_oz_mode(sbi))
-                        res = 1;
-                else
-                        res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-                unlock_kernel();
-                return res;
-        }
-        /* Negative dentry.. invalidate if "old" */
-        if (!dentry->d_inode) {
-                unlock_kernel();
-                return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
-        }
-                
-        /* Check for a non-mountpoint directory */
-        if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-                if (autofs_oz_mode(sbi))
-                        res = 1;
-                else
-                        res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-                unlock_kernel();
-                return res;
-        }
-        /* Update the usage list */
-        if (!autofs_oz_mode(sbi)) {
-                ent = (struct autofs_dir_ent *) dentry->d_time;
-                if (ent)
-                        autofs_update_usage(&sbi->dirhash,ent);
-        }
-        unlock_kernel();
-        return 1;
-}
-static const struct dentry_operations autofs_dentry_operations = {
-        .d_revalidate   = autofs_revalidate,
-};
-static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct autofs_sb_info *sbi;
-        int oz_mode;
-        DPRINTK(("autofs_root_lookup: name = "));
-        lock_kernel();
-        autofs_say(dentry->d_name.name,dentry->d_name.len);
-        if (dentry->d_name.len > NAME_MAX) {
-                unlock_kernel();
-                return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
-        }
-        sbi = autofs_sbi(dir->i_sb);
-        oz_mode = autofs_oz_mode(sbi);
-        DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
-                                "oz_mode = %d\n", task_pid_nr(current),
-                                task_pgrp_nr(current), sbi->catatonic,
-                                oz_mode));
-        /*
-         * Mark the dentry incomplete, but add it. This is needed so
-         * that the VFS layer knows about the dentry, and we can count
-         * on catching any lookups through the revalidate.
-         *
-         * Let all the hard work be done by the revalidate function that
-         * needs to be able to do this anyway..
-         *
-         * We need to do this before we release the directory semaphore.
-         */
-        dentry->d_op = &autofs_dentry_operations;
-        dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-        d_add(dentry, NULL);
-        mutex_unlock(&dir->i_mutex);
-        autofs_revalidate(dentry, nd);
-        mutex_lock(&dir->i_mutex);
-        /*
-         * If we are still pending, check if we had to handle
-         * a signal. If so we can force a restart..
-         */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-                /* See if we were interrupted */
-                if (signal_pending(current)) {
-                        sigset_t *sigset = &current->pending.signal;
-                        if (sigismember (sigset, SIGKILL) ||
-                            sigismember (sigset, SIGQUIT) ||
-                            sigismember (sigset, SIGINT)) {
-                                unlock_kernel();
-                                return ERR_PTR(-ERESTARTNOINTR);
-                        }
-                }
-        }
-        unlock_kernel();
-        /*
-         * If this dentry is unhashed, then we shouldn't honour this
-         * lookup even if the dentry is positive.  Returning ENOENT here
-         * doesn't do the right thing for all system calls, but it should
-         * be OK for the operations we permit from an autofs.
-         */
-        if (dentry->d_inode && d_unhashed(dentry))
-                return ERR_PTR(-ENOENT);
-        return NULL;
-}
-static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned int n;
-        int slsize;
-        struct autofs_symlink *sl;
-        struct inode *inode;
-        DPRINTK(("autofs_root_symlink: %s <- ", symname));
-        autofs_say(dentry->d_name.name,dentry->d_name.len);
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        if (autofs_hash_lookup(dh, &dentry->d_name)) {
-                unlock_kernel();
-                return -EEXIST;
-        }
-        n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-        if (n >= AUTOFS_MAX_SYMLINKS) {
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        set_bit(n,sbi->symlink_bitmap);
-        sl = &sbi->symlink[n];
-        sl->len = strlen(symname);
-        sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-        if (!sl->data) {
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-        if (!ent) {
-                kfree(sl->data);
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-        if (!ent->name) {
-                kfree(sl->data);
-                kfree(ent);
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        memcpy(sl->data,symname,slsize);
-        sl->mtime = get_seconds();
-        ent->ino = AUTOFS_FIRST_SYMLINK + n;
-        ent->hash = dentry->d_name.hash;
-        memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-        ent->dentry = NULL;     /* We don't keep the dentry for symlinks */
-        autofs_hash_insert(dh,ent);
-        inode = autofs_iget(dir->i_sb, ent->ino);
-        if (IS_ERR(inode))
-                return PTR_ERR(inode);
-        d_instantiate(dentry, inode);
-        unlock_kernel();
-        return 0;
-}
-/*
- * NOTE!
- *
- * Normal filesystems would do a "d_delete()" to tell the VFS dcache
- * that the file no longer exists. However, doing that means that the
- * VFS layer can turn the dentry into a negative dentry, which we
- * obviously do not want (we're dropping the entry not because it
- * doesn't exist, but because it has timed out).
- *
- * Also see autofs_root_rmdir()..
- */
-static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned int n;
-        /* This allows root to remove symlinks */
-        lock_kernel();
-        if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOENT;
-        }
-        n = ent->ino - AUTOFS_FIRST_SYMLINK;
-        if (n >= AUTOFS_MAX_SYMLINKS) {
-                unlock_kernel();
-                return -EISDIR; /* It's a directory, dummy */
-        }
-        if (!test_bit(n,sbi->symlink_bitmap)) {
-                unlock_kernel();
-                return -EINVAL; /* Nonexistent symlink?  Shouldn't happen */
-        }
-        
-        dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-        autofs_hash_delete(ent);
-        clear_bit(n,sbi->symlink_bitmap);
-        kfree(sbi->symlink[n].data);
-        d_drop(dentry);
-        
-        unlock_kernel();
-        return 0;
-}
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOENT;
-        }
-        if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-                unlock_kernel();
-                return -ENOTDIR; /* Not a directory */
-        }
-        if (ent->dentry != dentry) {
-                printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-        }
-        dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-        autofs_hash_delete(ent);
-        drop_nlink(dir);
-        d_drop(dentry);
-        unlock_kernel();
-        return 0;
-}
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        struct inode *inode;
-        ino_t ino;
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (ent) {
-                unlock_kernel();
-                return -EEXIST;
-        }
-        if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-                printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ino = sbi->next_dir_ino++;
-        ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-        if (!ent->name) {
-                kfree(ent);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->hash = dentry->d_name.hash;
-        memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-        ent->ino = ino;
-        ent->dentry = dentry;
-        autofs_hash_insert(dh,ent);
-        inc_nlink(dir);
-        inode = autofs_iget(dir->i_sb, ino);
-        if (IS_ERR(inode)) {
-                drop_nlink(dir);
-                return PTR_ERR(inode);
-        }
-        d_instantiate(dentry, inode);
-        unlock_kernel();
-        return 0;
-}
-/* Get/set timeout ioctl() operation */
-#ifdef CONFIG_COMPAT
-static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
-                                         unsigned int __user *p)
-{
-        unsigned long ntimeout;
-        if (get_user(ntimeout, p) ||
-            put_user(sbi->exp_timeout / HZ, p))
-                return -EFAULT;
-        if (ntimeout > UINT_MAX/HZ)
-                sbi->exp_timeout = 0;
-        else
-                sbi->exp_timeout = ntimeout * HZ;
-        return 0;
-}
-#endif
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-                                         unsigned long __user *p)
-{
-        unsigned long ntimeout;
-        if (get_user(ntimeout, p) ||
-            put_user(sbi->exp_timeout / HZ, p))
-                return -EFAULT;
-        if (ntimeout > ULONG_MAX/HZ)
-                sbi->exp_timeout = 0;
-        else
-                sbi->exp_timeout = ntimeout * HZ;
-        return 0;
-}
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-        return put_user(AUTOFS_PROTO_VERSION, p);
-}
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-                                    struct autofs_sb_info *sbi,
-                                    struct vfsmount *mnt,
-                                    struct autofs_packet_expire __user *pkt_p)
-{
-        struct autofs_dir_ent *ent;
-        struct autofs_packet_expire pkt;
-        memset(&pkt,0,sizeof pkt);
-        pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-        pkt.hdr.type = autofs_ptype_expire;
-        if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
-                return -EAGAIN;
-        pkt.len = ent->len;
-        memcpy(pkt.name, ent->name, pkt.len);
-        pkt.name[pkt.len] = '\0';
-        if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-                return -EFAULT;
-        return 0;
-}
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-        void __user *argp = (void __user *)arg;
-        DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
-        if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-             _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-                return -ENOTTY;
-        
-        if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-                return -EPERM;
-        
-        switch(cmd) {
-        case AUTOFS_IOC_READY:  /* Wait queue: go ahead and retry */
-                return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
-        case AUTOFS_IOC_FAIL:   /* Wait queue: fail with ENOENT */
-                return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
-        case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-                autofs_catatonic_mode(sbi);
-                return 0;
-        case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-                return autofs_get_protover(argp);
-#ifdef CONFIG_COMPAT
-        case AUTOFS_IOC_SETTIMEOUT32:
-                return autofs_compat_get_set_timeout(sbi, argp);
-#endif
-        case AUTOFS_IOC_SETTIMEOUT:
-                return autofs_get_set_timeout(sbi, argp);
-        case AUTOFS_IOC_EXPIRE:
-                return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-                                         argp);
-        default:
-                return -ENOSYS;
-        }
-}
-static long autofs_root_ioctl(struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        int ret;
-        lock_kernel();
-        ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
-                                   filp, cmd, arg);
-        unlock_kernel();
-        return ret;
-}
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        struct inode *inode = filp->f_path.dentry->d_inode;
-        int ret;
-        lock_kernel();
-        if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
-                ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
-        else
-                ret = autofs_do_root_ioctl(inode, filp, cmd,
-                        (unsigned long)compat_ptr(arg));
-        unlock_kernel();
-        return ret;
-}
-#endif
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include "autofs_i.h"
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-        nd_set_link(nd, s);
-        return NULL;
-}
-const struct inode_operations autofs_symlink_inode_operations = {
-        .readlink       = generic_readlink,
-        .follow_link    = autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS   (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-        struct autofs_wait_queue *wq, *nwq;
-        DPRINTK(("autofs: entering catatonic mode\n"));
-        sbi->catatonic = 1;
-        wq = sbi->queues;
-        sbi->queues = NULL;     /* Erase all wait queues */
-        while ( wq ) {
-                nwq = wq->next;
-                wq->status = -ENOENT; /* Magic is gone - report failure */
-                kfree(wq->name);
-                wq->name = NULL;
-                wake_up(&wq->queue);
-                wq = nwq;
-        }
-        fput(sbi->pipe);        /* Close the pipe */
-        sbi->pipe = NULL;
-        autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-        unsigned long sigpipe, flags;
-        mm_segment_t fs;
-        const char *data = (const char *)addr;
-        ssize_t wr = 0;
-        /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-        sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-        /* Save pointer to user space and point back to kernel space */
-        fs = get_fs();
-        set_fs(KERNEL_DS);
-        while (bytes &&
-               (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
-                data += wr;
-                bytes -= wr;
-        }
-        set_fs(fs);
-        /* Keep the currently executing process from receiving a
-           SIGPIPE unless it was already supposed to get one */
-        if (wr == -EPIPE && !sigpipe) {
-                spin_lock_irqsave(&current->sighand->siglock, flags);
-                sigdelset(&current->pending.signal, SIGPIPE);
-                recalc_sigpending();
-                spin_unlock_irqrestore(&current->sighand->siglock, flags);
-        }
-        return (bytes > 0);
-}
-        
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-        struct autofs_packet_missing pkt;
-        DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-        autofs_say(wq->name,wq->len);
-        memset(&pkt,0,sizeof pkt); /* For security reasons */
-        pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-        pkt.hdr.type = autofs_ptype_missing;
-        pkt.wait_queue_token = wq->wait_queue_token;
-        pkt.len = wq->len;
-        memcpy(pkt.name, wq->name, pkt.len);
-        pkt.name[pkt.len] = '\0';
-        if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
-                autofs_catatonic_mode(sbi);
-}
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-        struct autofs_wait_queue *wq;
-        int status;
-        /* In catatonic mode, we don't wait for nobody */
-        if ( sbi->catatonic )
-                return -ENOENT;
-        
-        /* We shouldn't be able to get here, but just in case */
-        if ( name->len > NAME_MAX )
-                return -ENOENT;
-        for ( wq = sbi->queues ; wq ; wq = wq->next ) {
-                if ( wq->hash == name->hash &&
-                     wq->len == name->len &&
-                     wq->name && !memcmp(wq->name,name->name,name->len) )
-                        break;
-        }
-        
-        if ( !wq ) {
-                /* Create a new wait queue */
-                wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
-                if ( !wq )
-                        return -ENOMEM;
-                wq->name = kmalloc(name->len,GFP_KERNEL);
-                if ( !wq->name ) {
-                        kfree(wq);
-                        return -ENOMEM;
-                }
-                wq->wait_queue_token = autofs_next_wait_queue++;
-                init_waitqueue_head(&wq->queue);
-                wq->hash = name->hash;
-                wq->len = name->len;
-                wq->status = -EINTR; /* Status return if interrupted */
-                memcpy(wq->name, name->name, name->len);
-                wq->next = sbi->queues;
-                sbi->queues = wq;
-                /* autofs_notify_daemon() may block */
-                wq->wait_ctr = 2;
-                autofs_notify_daemon(sbi,wq);
-        } else
-                wq->wait_ctr++;
-        /* wq->name is NULL if and only if the lock is already released */
-        if ( sbi->catatonic ) {
-                /* We might have slept, so check again for catatonic mode */
-                wq->status = -ENOENT;
-                kfree(wq->name);
-                wq->name = NULL;
-        }
-        if ( wq->name ) {
-                /* Block all but "shutdown" signals while waiting */
-                sigset_t sigmask;
-                siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-                sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-                interruptible_sleep_on(&wq->queue);
-                sigprocmask(SIG_SETMASK, &sigmask, NULL);
-        } else {
-                DPRINTK(("autofs_wait: skipped sleeping\n"));
-        }
-        status = wq->status;
-        if ( ! --wq->wait_ctr ) /* Are we the last process to need status? */
-                kfree(wq);
-        return status;
-}
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-        struct autofs_wait_queue *wq, **wql;
-        for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-                if ( wq->wait_queue_token == wait_queue_token )
-                        break;
-        }
-        if ( !wq )
-                return -EINVAL;
-        *wql = wq->next;        /* Unlink from chain */
-        kfree(wq->name);
-        wq->name = NULL;        /* Do not wait on this queue */
-        wq->status = status;
-        if ( ! --wq->wait_ctr ) /* Is anyone still waiting for this guy? */
-                kfree(wq);
-        else
-                wake_up(&wq->queue);
-        return 0;
-}
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
-static int autofs_get_sb(struct file_system_type *fs_type,
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 static struct file_system_type autofs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "autofs",
-        .get_sb         = autofs_get_sb,
+        .mount          = autofs_mount,
        .kill_sb        = autofs4_kill_sb,
 };
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
                inode->i_gid = sb->s_root->d_inode->i_gid;
        }
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        inode->i_ino = get_next_ino();
        if (S_ISDIR(inf->mode)) {
                inode->i_nlink = 2;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..aa4e7c7ae3c6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int
+static struct dentry *
-befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
-            void *data, struct vfsmount *mnt)
+            void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
-                           mnt);
 }
 static struct file_system_type befs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "befs",
-        .get_sb         = befs_get_sb,
+        .mount          = befs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,      
 };
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(new, inode);
        mutex_unlock(&info->bfs_lock);
        return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 883e77acd5a8..76db6d7d49bb 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -450,16 +450,16 @@ out:
        return ret;
 }
-static int bfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
 }
 static struct file_system_type bfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "bfs",
-        .get_sb         = bfs_get_sb,
+        .mount          = bfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 139fc8083f53..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                        current_fs_time(inode->i_sb);
@@ -705,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
        return err;
 }
-static int bm_get_sb(struct file_system_type *fs_type,
+static struct dentry *bm_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
+        return mount_single(fs_type, flags, data, bm_fill_super);
 }
 static struct linux_binfmt misc_format = {
@@ -719,7 +720,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "binfmt_misc",
-        .get_sb         = bm_get_sb,
+        .mount          = bm_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b737451e2e9d..06e8ff12b97c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 EXPORT_SYMBOL(I_BDEV);
+/*
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+                        struct backing_dev_info *dst)
+{
+        spin_lock(&inode_lock);
+        inode->i_data.backing_dev_info = dst;
+        if (inode->i_state & I_DIRTY)
+                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+        spin_unlock(&inode_lock);
+}
 static sector_t max_block(struct block_device *bdev)
 {
        sector_t retval = ~((sector_t)0);
@@ -449,15 +464,15 @@ static const struct super_operations bdev_sops = {
        .evict_inode = bdev_evict_inode,
 };
-static int bd_get_sb(struct file_system_type *fs_type,
+static struct dentry *bd_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
+        return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
 }
 static struct file_system_type bd_type = {
        .name           = "bdev",
-        .get_sb         = bd_get_sb,
+        .mount          = bd_mount,
        .kill_sb        = kill_anon_super,
 };
@@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget);
 */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-        atomic_inc(&bdev->bd_inode->i_count);
+        ihold(bdev->bd_inode);
        return bdev;
 }
@@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
        if (bdev) {
-                atomic_inc(&bdev->bd_inode->i_count);
+                ihold(bdev->bd_inode);
                spin_unlock(&bdev_lock);
                return bdev;
        }
@@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
                spin_lock(&bdev_lock);
                if (!inode->i_bdev) {
                        /*
-                         * We take an additional bd_inode->i_count for inode,
+                         * We take an additional reference to bd_inode,
                         * and it's released in clear_inode() of inode.
                         * So, we can access it via ->i_mapping always
                         * without igrab().
                         */
-                        atomic_inc(&bdev->bd_inode->i_count);
+                        ihold(bdev->bd_inode);
                        inode->i_bdev = bdev;
                        inode->i_mapping = bdev->bd_inode->i_mapping;
                        list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                bdi = blk_get_backing_dev_info(bdev);
                                if (bdi == NULL)
                                        bdi = &default_backing_dev_info;
-                                bdev->bd_inode->i_data.backing_dev_info = bdi;
+                                bdev_inode_switch_bdi(bdev->bd_inode, bdi);
                        }
                        if (bdev->bd_invalidated)
                                rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        if (ret)
                                goto out_clear;
                        bdev->bd_contains = whole;
-                        bdev->bd_inode->i_data.backing_dev_info =
+                        bdev_inode_switch_bdi(bdev->bd_inode,
-                           whole->bd_inode->i_data.backing_dev_info;
+                                whole->bd_inode->i_data.backing_dev_info);
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
-        bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+        bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                disk_put_part(bdev->bd_part);
                bdev->bd_part = NULL;
                bdev->bd_disk = NULL;
-                bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+                bdev_inode_switch_bdi(bdev->bd_inode,
+                                        &default_backing_dev_info);
                if (bdev != bdev->bd_contains)
                        victim = bdev->bd_contains;
                bdev->bd_contains = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..64f99cf69ce0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3849,7 +3849,7 @@ again:
        p = &root->inode_tree.rb_node;
        parent = NULL;
-        if (hlist_unhashed(&inode->i_hash))
+        if (inode_unhashed(inode))
                return;
        spin_lock(&root->inode_lock);
@@ -4758,7 +4758,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
        btrfs_set_trans_block_group(trans, dir);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 144f8a5730f5..ebe46c628748 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -560,8 +560,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
 * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
 *        for multiple device setup.  Make sure to keep it in sync.
 */
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data, struct vfsmount *mnt)
+                const char *dev_name, void *data)
 {
        struct block_device *bdev = NULL;
        struct super_block *s;
@@ -580,7 +580,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
                                          &subvol_name, &subvol_objectid,
                                          &fs_devices);
        if (error)
-                return error;
+                return ERR_PTR(error);
        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
        if (error)
@@ -656,11 +656,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
                root = new_root;
        }
-        mnt->mnt_sb = s;
-        mnt->mnt_root = root;
        kfree(subvol_name);
-        return 0;
+        return root;
 error_s:
        error = PTR_ERR(s);
@@ -669,7 +666,7 @@ error_close_devices:
 error_free_subvol_name:
        kfree(subvol_name);
 error:
-        return error;
+        return ERR_PTR(error);
 }
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -746,7 +743,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
-        .get_sb         = btrfs_get_sb,
+        .mount          = btrfs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/buffer.c b/fs/buffer.c
index 7f0b9b083f77..5930e382959b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -905,7 +905,6 @@ try_again:
                bh->b_state = 0;
                atomic_set(&bh->b_count, 0);
-                bh->b_private = NULL;
                bh->b_size = size;
                /* Link the buffer to its page */
@@ -1706,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
 {
+        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
@@ -1916,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
        }
        return err;
 }
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
 static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
@@ -1953,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
        return 0;
 }
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-                get_block_t *get_block)
-{
-        unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-        return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
 /*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
@@ -2379,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        else
                end = PAGE_CACHE_SIZE;
-        ret = block_prepare_write(page, 0, end, get_block);
+        ret = __block_write_begin(page, 0, end, get_block);
        if (!ret)
                ret = block_commit_write(page, 0, end);
@@ -2466,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
        *fsdata = NULL;
        if (page_has_buffers(page)) {
-                unlock_page(page);
+                ret = __block_write_begin(page, pos, len, get_block);
-                page_cache_release(page);
+                if (unlikely(ret))
-                *pagep = NULL;
+                        goto out_release;
-                return block_write_begin(mapping, pos, len, flags, pagep,
+                return ret;
-                                         get_block);
        }
        if (PageMappedToDisk(page))
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 51bcc5ce3230..e9c874abc9e1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping,
                                 struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc;
        pgoff_t index, start, end;
@@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping,
        pagevec_init(&pvec, 0);
-        /* ?? */
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                dout(" writepages congested\n");
-                wbc->encountered_congestion = 1;
-                goto out_final;
-        }
        /* where to start/end? */
        if (wbc->range_cyclic) {
                start = mapping->writeback_index; /* Start from prev offset */
@@ -885,7 +877,6 @@ out:
                rc = 0;  /* vfs expects us to return 0 */
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
-out_final:
        return rc;
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d6e0e0421891..08b460ae0539 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 /*
 * mount: join the ceph cluster, and open root directory.
 */
-static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
                      const char *path)
 {
        int err;
@@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
                }
        }
-        mnt->mnt_root = root;
-        mnt->mnt_sb = fsc->sb;
        fsc->mount_state = CEPH_MOUNT_MOUNTED;
        dout("mount success\n");
-        err = 0;
+        mutex_unlock(&fsc->client->mount_mutex);
+        return root;
 out:
        mutex_unlock(&fsc->client->mount_mutex);
-        return err;
+        return ERR_PTR(err);
 fail:
        if (first) {
@@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb,
        return err;
 }
-static int ceph_get_sb(struct file_system_type *fs_type,
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
-                       int flags, const char *dev_name, void *data,
+                       int flags, const char *dev_name, void *data)
-                       struct vfsmount *mnt)
 {
        struct super_block *sb;
        struct ceph_fs_client *fsc;
+        struct dentry *res;
        int err;
        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
        const char *path = NULL;
        struct ceph_mount_options *fsopt = NULL;
        struct ceph_options *opt = NULL;
-        dout("ceph_get_sb\n");
+        dout("ceph_mount\n");
        err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
-        if (err < 0)
+        if (err < 0) {
+                res = ERR_PTR(err);
                goto out_final;
+        }
        /* create client (which we may/may not use) */
        fsc = create_fs_client(fsopt, opt);
        if (IS_ERR(fsc)) {
-                err = PTR_ERR(fsc);
+                res = ERR_CAST(fsc);
                kfree(fsopt);
                kfree(opt);
                goto out_final;
        }
        err = ceph_mdsc_init(fsc);
-        if (err < 0)
+        if (err < 0) {
+                res = ERR_PTR(err);
                goto out;
+        }
        if (ceph_test_opt(fsc->client, NOSHARE))
                compare_super = NULL;
        sb = sget(fs_type, compare_super, ceph_set_super, fsc);
        if (IS_ERR(sb)) {
-                err = PTR_ERR(sb);
+                res = ERR_CAST(sb);
                goto out;
        }
@@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type,
        } else {
                dout("get_sb using new client %p\n", fsc);
                err = ceph_register_bdi(sb, fsc);
-                if (err < 0)
+                if (err < 0) {
+                        res = ERR_PTR(err);
                        goto out_splat;
+                }
        }
-        err = ceph_mount(fsc, mnt, path);
+        res = ceph_real_mount(fsc, path);
-        if (err < 0)
+        if (IS_ERR(res))
                goto out_splat;
-        dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+        dout("root %p inode %p ino %llx.%llx\n", res,
-             mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+             res->d_inode, ceph_vinop(res->d_inode));
-        return 0;
+        return res;
 out_splat:
        ceph_mdsc_close_sessions(fsc->mdsc);
@@ -843,8 +847,8 @@ out:
        ceph_mdsc_destroy(fsc);
        destroy_fs_client(fsc);
 out_final:
-        dout("ceph_get_sb fail %d\n", err);
+        dout("ceph_mount fail %ld\n", PTR_ERR(res));
-        return err;
+        return res;
 }
 static void ceph_kill_sb(struct super_block *s)
@@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s)
 static struct file_system_type ceph_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ceph",
-        .get_sb         = ceph_get_sb,
+        .mount          = ceph_mount,
        .kill_sb        = ceph_kill_sb,
        .fs_flags       = FS_RENAME_DOES_D_MOVE,
 };
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0ed213970ced 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,9 @@ config CIFS
        tristate "CIFS support (advanced network filesystem, SMBFS successor)"
        depends on INET
        select NLS
+        select CRYPTO
+        select CRYPTO_MD5
+        select CRYPTO_ARC4
        help
          This is the client VFS module for the Common Internet File System
          (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7ac0056294cf..f856732161ab 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -43,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
                       unsigned char *p24);
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-                                const struct session_key *key, char *signature)
+                                struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
+        int rc;
-        if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+        if (cifs_pdu == NULL || signature == NULL || server == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->secmech.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1, "%s: Can't generate signature\n", __func__);
-        cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+                return -1;
+        }
+        rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Oould not init md5\n", __func__);
+                return rc;
+        }
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                server->session_key.response, server->session_key.len);
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+        rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
-        cifs_MD5_final(signature, &context);
        return 0;
 }
@@ -79,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calculate_signature(cifs_pdu, &server->session_key,
+        rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -90,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-                                const struct session_key *key, char *signature)
+                                struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
        int i;
+        int rc;
-        if ((iov == NULL) || (signature == NULL) || (key == NULL))
+        if (iov == NULL || signature == NULL || server == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->secmech.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1, "%s: Can't generate signature\n", __func__);
+                return -1;
+        }
+        rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Oould not init md5\n", __func__);
+                return rc;
+        }
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                server->session_key.response, server->session_key.len);
        for (i = 0; i < n_vec; i++) {
                if (iov[i].iov_len == 0)
                        continue;
@@ -112,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
                if (i == 0) {
                        if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
                                break; /* nothing to sign or corrupt header */
-                        cifs_MD5_update(&context, iov[0].iov_base+4,
+                        crypto_shash_update(&server->secmech.sdescmd5->shash,
-                                  iov[0].iov_len-4);
+                                iov[i].iov_base + 4, iov[i].iov_len - 4);
                } else
-                        cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+                        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                                iov[i].iov_base, iov[i].iov_len);
        }
-        cifs_MD5_final(signature, &context);
+        rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
-        return 0;
+        return rc;
 }
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
                   __u32 *pexpected_response_sequence_number)
 {
@@ -146,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calc_signature2(iov, n_vec, &server->session_key,
+        rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -157,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-                          const struct session_key *session_key,
+                          struct TCP_Server_Info *server,
                          __u32 expected_sequence_number)
 {
        unsigned int rc;
        char server_response_sig[8];
        char what_we_think_sig_should_be[20];
-        if (cifs_pdu == NULL || session_key == NULL)
+        if (cifs_pdu == NULL || server == NULL)
                return -EINVAL;
        if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -193,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
                                        cpu_to_le32(expected_sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
-        rc = cifs_calculate_signature(cifs_pdu, session_key,
+        rc = cifs_calculate_signature(cifs_pdu, server,
                what_we_think_sig_should_be);
        if (rc)
@@ -209,18 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
-/* We fill in key by putting in 40 byte array which was allocated by caller */
+/* First calculate the 24-byte NTLM response, then the 16-byte session key. */
-int cifs_calculate_session_key(struct session_key *key, const char *rn,
+int setup_ntlm_response(struct cifsSesInfo *ses)
-                           const char *password)
 {
-        char temp_key[16];
+        unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
-        if ((key == NULL) || (rn == NULL))
+        char temp_key[CIFS_SESS_KEY_SIZE];
+        if (!ses)
                return -EINVAL;
-        E_md4hash(password, temp_key);
+        ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
-        mdfour(key->data.ntlm, temp_key, 16);
+        if (!ses->auth_key.response) {
-        memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE);
+                cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
-        key->len = 40;
+                return -ENOMEM;
+        }
+        ses->auth_key.len = temp_len;
+        SMBNTencrypt(ses->password, ses->server->cryptkey,
+                        ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+        E_md4hash(ses->password, temp_key);
+        mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
        return 0;
 }
@@ -294,15 +328,15 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
         * two times the unicode length of a server name +
         * size of a timestamp (which is 8 bytes).
         */
-        ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
+        ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8;
-        ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL);
+        ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
-        if (!ses->tiblob) {
+        if (!ses->auth_key.response) {
-                ses->tilen = 0;
+                ses->auth_key.len = 0;
                cERROR(1, "Challenge target info allocation failure");
                return -ENOMEM;
        }
-        blobptr = ses->tiblob;
+        blobptr = ses->auth_key.response;
        attrptr = (struct ntlmssp2_name *) blobptr;
        attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
@@ -357,7 +391,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 * about target string i.e. for some, just user name might suffice.
 */
 static int
-find_domain_name(struct cifsSesInfo *ses)
+find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
        unsigned int attrsize;
        unsigned int type;
@@ -366,11 +400,11 @@ find_domain_name(struct cifsSesInfo *ses)
        unsigned char *blobend;
        struct ntlmssp2_name *attrptr;
-        if (!ses->tilen || !ses->tiblob)
+        if (!ses->auth_key.len || !ses->auth_key.response)
                return 0;
-        blobptr = ses->tiblob;
+        blobptr = ses->auth_key.response;
-        blobend = ses->tiblob + ses->tilen;
+        blobend = blobptr + ses->auth_key.len;
        while (blobptr + onesize < blobend) {
                attrptr = (struct ntlmssp2_name *) blobptr;
@@ -386,16 +420,13 @@ find_domain_name(struct cifsSesInfo *ses)
                        if (!attrsize)
                                break;
                        if (!ses->domainName) {
-                                struct nls_table *default_nls;
                                ses->domainName =
                                        kmalloc(attrsize + 1, GFP_KERNEL);
                                if (!ses->domainName)
                                                return -ENOMEM;
-                                default_nls = load_nls_default();
                                cifs_from_ucs2(ses->domainName,
                                        (__le16 *)blobptr, attrsize, attrsize,
-                                        default_nls, false);
+                                        nls_cp, false);
-                                unload_nls(default_nls);
                                break;
                        }
                }
@@ -405,82 +436,136 @@ find_domain_name(struct cifsSesInfo *ses)
        return 0;
 }
+/*
+ * Compute the NTLMv2 hash into ntlmv2_hash, which must be large enough for
+ * the hmac(md5) digest written by crypto_shash_final().
+ *
+ * hmac(md5) is keyed with the MD4 hash of ses->password and run over the
+ * upper-cased unicode user name, followed by the unicode domain name or,
+ * when no domain name is set, the unicode server name (neither of those
+ * is upper-cased here).
+ *
+ * Returns 0 on success, -1 if the hmac-md5 descriptor is missing, -ENOMEM
+ * on allocation failure, or the error reported by the crypto API.
+ */
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
                            const struct nls_table *nls_cp)
 {
        int rc = 0;
        int len;
-        char nt_hash[16];
+        char nt_hash[CIFS_NTHASH_SIZE];
-        struct HMACMD5Context *pctxt;
        wchar_t *user;
        wchar_t *domain;
+        wchar_t *server;
-        pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
+        if (!ses->server->secmech.sdeschmacmd5) {
+                cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
-        if (pctxt == NULL)
+                return -1;
-                return -ENOMEM;
+        }
        /* calculate md4 hash of password */
        E_md4hash(ses->password, nt_hash);
-        /* convert Domainname to unicode and uppercase */
+        /* do not hash anything unless the key was actually installed */
+        rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
-        hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+                                CIFS_NTHASH_SIZE);
+        if (rc) {
+                cERROR(1, "calc_ntlmv2_hash: could not set NT hash as key\n");
+                return rc;
+        }
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+                return rc;
+        }
        /* convert ses->userName to unicode and uppercase */
        len = strlen(ses->userName);
        user = kmalloc(2 + (len * 2), GFP_KERNEL);
-        if (user == NULL)
+        if (user == NULL) {
+                cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+                rc = -ENOMEM;
                goto calc_exit_2;
+        }
        len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
        UniStrupr(user);
-        hmac_md5_update((char *)user, 2*len, pctxt);
+        rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                                (char *)user, 2 * len);
+        if (rc) {
+                cERROR(1, "calc_ntlmv2_hash: could not update with user\n");
+                goto calc_exit_1;
+        }
        /* convert ses->domainName to unicode and uppercase */
        if (ses->domainName) {
                len = strlen(ses->domainName);
                domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-                if (domain == NULL)
+                if (domain == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+                        rc = -ENOMEM;
                        goto calc_exit_1;
+                }
                len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
                                        nls_cp);
-                /* the following line was removed since it didn't work well
+                rc = crypto_shash_update(
+                                &ses->server->secmech.sdeschmacmd5->shash,
-                   with lower cased domain name that passed as an option.
+                                (char *)domain, 2 * len);
-                   Maybe converting the domain name earlier makes sense */
-                /* UniStrupr(domain); */
-                hmac_md5_update((char *)domain, 2*len, pctxt);
                kfree(domain);
+                /* the temporary is released first so the error path stays simple */
+                if (rc) {
+                        cERROR(1, "calc_ntlmv2_hash: domain update failed\n");
+                        goto calc_exit_1;
+                }
+        } else if (ses->serverName) {
+                len = strlen(ses->serverName);
+                server = kmalloc(2 + (len * 2), GFP_KERNEL);
+                if (server == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+                        rc = -ENOMEM;
+                        goto calc_exit_1;
+                }
+                len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+                                        nls_cp);
+                rc = crypto_shash_update(
+                                &ses->server->secmech.sdeschmacmd5->shash,
+                                (char *)server, 2 * len);
+                kfree(server);
+                if (rc) {
+                        cERROR(1, "calc_ntlmv2_hash: server update failed\n");
+                        goto calc_exit_1;
+                }
        }
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                                        ntlmv2_hash);
 calc_exit_1:
        kfree(user);
 calc_exit_2:
-        /* BB FIXME what about bytes 24 through 40 of the signing key?
+        return rc;
-           compare with the NTLM example */
+}
-        hmac_md5_final(ses->ntlmv2_hash, pctxt);
+/*
+ * Compute the first part of the NTLMv2 client response in place inside
+ * ses->auth_key.response.
+ *
+ * The server challenge (ses->ntlmssp->cryptkey for RawNTLMSSP, otherwise
+ * ses->server->cryptkey) is copied to offset CIFS_SESS_KEY_SIZE + 8 of the
+ * buffer, hmac(md5) keyed with ntlmv2_hash is run from that offset to the
+ * end of the buffer, and the digest is written at offset
+ * CIFS_SESS_KEY_SIZE, overwriting the copied challenge.
+ *
+ * Returns 0 on success, -1 if the hmac-md5 descriptor is missing, or the
+ * error reported by the crypto API.
+ */
+static int
+CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+{
+        int rc;
+        unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+        if (!ses->server->secmech.sdeschmacmd5) {
+                cERROR(1, "CalcNTLMv2_response: can't generate ntlmv2 response\n");
+                return -1;
+        }
+        /* do not hash anything unless the key was actually installed */
+        rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
+                                ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not set ntlmv2 hash as key");
+                return rc;
+        }
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+                return rc;
+        }
+        if (ses->server->secType == RawNTLMSSP)
+                memcpy(ses->auth_key.response + offset,
+                        ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+        else
+                memcpy(ses->auth_key.response + offset,
+                        ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+        rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response + offset, ses->auth_key.len - offset);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not update hmacmd5");
+                return rc;
+        }
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response + CIFS_SESS_KEY_SIZE);
-        kfree(pctxt);
        return rc;
 }
 int
-setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
-                      const struct nls_table *nls_cp)
 {
        int rc;
-        struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
+        int baselen;
-        struct HMACMD5Context context;
+        unsigned int tilen;
+        struct ntlmv2_resp *buf;
-        buf->blob_signature = cpu_to_le32(0x00000101);
+        char ntlmv2_hash[16];
-        buf->reserved = 0;
+        unsigned char *tiblob = NULL; /* target info blob */
-        buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-        get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
-        buf->reserved2 = 0;
        if (ses->server->secType == RawNTLMSSP) {
                if (!ses->domainName) {
-                        rc = find_domain_name(ses);
+                        rc = find_domain_name(ses, nls_cp);
                        if (rc) {
                                cERROR(1, "error %d finding domain name", rc);
                                goto setup_ntlmv2_rsp_ret;
@@ -490,51 +575,179 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
                rc = build_avpair_blob(ses, nls_cp);
                if (rc) {
                        cERROR(1, "error %d building av pair blob", rc);
-                        return rc;
+                        goto setup_ntlmv2_rsp_ret;
                }
        }
-        /* calculate buf->ntlmv2_hash */
+        baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
-        rc = calc_ntlmv2_hash(ses, nls_cp);
+        tilen = ses->auth_key.len;
+        tiblob = ses->auth_key.response;
+        ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
+        if (!ses->auth_key.response) {
+                rc = ENOMEM;
+                ses->auth_key.len = 0;
+                cERROR(1, "%s: Can't allocate auth blob", __func__);
+                goto setup_ntlmv2_rsp_ret;
+        }
+        ses->auth_key.len += baselen;
+        buf = (struct ntlmv2_resp *)
+                        (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+        buf->blob_signature = cpu_to_le32(0x00000101);
+        buf->reserved = 0;
+        buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+        get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
+        buf->reserved2 = 0;
+        memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+        /* calculate ntlmv2_hash */
+        rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
        if (rc) {
                cERROR(1, "could not get v2 hash rc %d", rc);
                goto setup_ntlmv2_rsp_ret;
        }
-        CalcNTLMv2_response(ses, resp_buf);
+        /* calculate first part of the client response (CR1) */
+        rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+        if (rc) {
+                cERROR(1, "Could not calculate CR1  rc: %d", rc);
+                goto setup_ntlmv2_rsp_ret;
+        }
        /* now calculate the session key for NTLMv2 */
-        hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
+        crypto_shash_setkey(ses->server->secmech.hmacmd5,
-        hmac_md5_update(resp_buf, 16, &context);
+                ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
-        hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context);
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+                goto setup_ntlmv2_rsp_ret;
+        }
-        memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf,
+        crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
-               sizeof(struct ntlmv2_resp));
+                ses->auth_key.response + CIFS_SESS_KEY_SIZE,
-        ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp);
+                CIFS_HMAC_MD5_HASH_SIZE);
-        return 0;
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response);
 setup_ntlmv2_rsp_ret:
-        kfree(ses->tiblob);
+        kfree(tiblob);
-        ses->tiblob = NULL;
-        ses->tilen = 0;
        return rc;
 }
+/*
+ * Generate a random secondary session key, encrypt it with "ecb(arc4)"
+ * keyed by the first CIFS_SESS_KEY_SIZE bytes of ses->auth_key.response,
+ * and store the ciphertext in ses->ntlmssp->ciphertext.  On success the
+ * random key replaces the start of ses->auth_key.response and
+ * ses->auth_key.len is reduced to CIFS_SESS_KEY_SIZE.
+ *
+ * Returns 0 on success or a negative error code.
+ */
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
+int
-                         char *v2_session_response)
+calc_seckey(struct cifsSesInfo *ses)
 {
-        struct HMACMD5Context context;
+        int rc;
-        /* rest of v2 struct already generated */
+        struct crypto_blkcipher *tfm_arc4;
-        memcpy(v2_session_response + 8, ses->cryptKey, 8);
+        struct scatterlist sgin, sgout;
-        hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context);
+        struct blkcipher_desc desc;
+        unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+        get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+        tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+        if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+                cERROR(1, "could not allocate crypto API arc4\n");
+                /* PTR_ERR(NULL) is 0; never report that as success */
+                return tfm_arc4 ? PTR_ERR(tfm_arc4) : -ENOMEM;
+        }
-        hmac_md5_update(v2_session_response+8,
+        desc.tfm = tfm_arc4;
+        desc.flags = 0; /* was left uninitialised on the stack */
-                        sizeof(struct ntlmv2_resp) - 8, &context);
-        if (ses->tilen)
+        rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
-                hmac_md5_update(ses->tiblob, ses->tilen, &context);
+                                        CIFS_SESS_KEY_SIZE);
+        if (rc) {
+                cERROR(1, "could not set arc4 key rc: %d\n", rc);
+                crypto_free_blkcipher(tfm_arc4);
+                return rc;
+        }
-        hmac_md5_final(v2_session_response, &context);
+        sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
-/*      cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+        sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+        rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+        if (rc) {
+                cERROR(1, "could not encrypt session key rc: %d\n", rc);
+                crypto_free_blkcipher(tfm_arc4);
+                return rc;
+        }
+        /* make secondary_key/nonce as session key */
+        memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+        /* and make len as that of session key only */
+        ses->auth_key.len = CIFS_SESS_KEY_SIZE;
+        crypto_free_blkcipher(tfm_arc4);
+        return 0;
+}
+/*
+ * Release the per-server hashing state: free the md5 and hmac(md5)
+ * transforms when they are set, then free both hash descriptors.
+ * The pointers in server->secmech are not cleared afterwards.
+ */
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+        if (server->secmech.md5)
+                crypto_free_shash(server->secmech.md5);
+        if (server->secmech.hmacmd5)
+                crypto_free_shash(server->secmech.hmacmd5);
+        kfree(server->secmech.sdeschmacmd5);
+        kfree(server->secmech.sdescmd5);
+}
+/*
+ * Allocate the per-server hashing state in server->secmech: the
+ * "hmac(md5)" and "md5" transforms plus one descriptor for each, sized as
+ * sizeof(struct shash_desc) plus the transform's descriptor size.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * allocated so far is freed again.
+ */
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+        int rc;
+        unsigned int size;
+        server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+        if (!server->secmech.hmacmd5 ||
+                        IS_ERR(server->secmech.hmacmd5)) {
+                cERROR(1, "could not allocate crypto hmacmd5\n");
+                /* PTR_ERR(NULL) is 0; never report that as success */
+                return server->secmech.hmacmd5 ?
+                        PTR_ERR(server->secmech.hmacmd5) : -ENOMEM;
+        }
+        server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+        if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+                cERROR(1, "could not allocate crypto md5\n");
+                /* same NULL-versus-ERR_PTR distinction as above */
+                rc = server->secmech.md5 ?
+                        PTR_ERR(server->secmech.md5) : -ENOMEM;
+                goto crypto_allocate_md5_fail;
+        }
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->secmech.hmacmd5);
+        server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->secmech.sdeschmacmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+                rc = -ENOMEM;
+                goto crypto_allocate_hmacmd5_sdesc_fail;
+        }
+        server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+        server->secmech.sdeschmacmd5->shash.flags = 0x0;
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->secmech.md5);
+        server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->secmech.sdescmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+                rc = -ENOMEM;
+                goto crypto_allocate_md5_sdesc_fail;
+        }
+        server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+        server->secmech.sdescmd5->shash.flags = 0x0;
+        return 0;
+        /* unwind in reverse order of allocation */
+crypto_allocate_md5_sdesc_fail:
+        kfree(server->secmech.sdeschmacmd5);
+crypto_allocate_hmacmd5_sdesc_fail:
+        crypto_free_shash(server->secmech.md5);
+crypto_allocate_md5_fail:
+        crypto_free_shash(server->secmech.hmacmd5);
+        return rc;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 34371637f210..75c4eaa79588 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -318,7 +318,6 @@ cifs_alloc_inode(struct super_block *sb)
                return NULL;
        cifs_inode->cifsAttrs = 0x20;   /* default */
        cifs_inode->time = 0;
-        cifs_inode->write_behind_rc = 0;
        /* Until the file is open and we have gotten oplock
        info back from the server, can not assume caching of
        file data or metadata */
@@ -545,9 +544,9 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
-static int
+static struct dentry *
-cifs_get_sb(struct file_system_type *fs_type,
+cifs_do_mount(struct file_system_type *fs_type,
-            int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+            int flags, const char *dev_name, void *data)
 {
        int rc;
        struct super_block *sb;
@@ -557,18 +556,17 @@ cifs_get_sb(struct file_system_type *fs_type,
        cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
        if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
        sb->s_flags = flags;
        rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
        if (rc) {
                deactivate_locked_super(sb);
-                return rc;
+                return ERR_PTR(rc);
        }
        sb->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 }
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -634,7 +632,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 struct file_system_type cifs_fs_type = {
        .owner = THIS_MODULE,
        .name = "cifs",
-        .get_sb = cifs_get_sb,
+        .mount = cifs_do_mount,
        .kill_sb = kill_anon_super,
        /*  .fs_flags */
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f35795a16b42..897b2b2b28b5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -112,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.67"
+#define CIFS_VERSION   "1.68"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 3365e77f6f24..f259e4d7612d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
 #include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
 /*
 * The sizes of various internal tables and strings
 */
@@ -74,7 +77,7 @@
 * CIFS vfs client Status information (based on what we know.)
 */
- /* associated with each tcp and smb session */
+/* associated with each tcp and smb session */
 enum statusEnum {
        CifsNew = 0,
        CifsGood,
@@ -99,14 +102,29 @@ enum protocolEnum {
 struct session_key {
        unsigned int len;
-        union {
+        char *response;
-                char ntlm[CIFS_SESS_KEY_SIZE + 16];
+};
-                char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
-                struct {
+/* crypto security descriptor definition */
-                        char key[16];
+struct sdesc {
-                        struct ntlmv2_resp resp;
+        struct shash_desc shash;
-                } ntlmv2;
+        char ctx[];
-        } data;
+};
+/* crypto hashing related structure/fields, not specific to a sec mech */
+struct cifs_secmech {
+        struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
+        struct crypto_shash *md5; /* md5 hash function */
+        struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
+        struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+};
+/* per smb session structure/fields */
+struct ntlmssp_auth {
+        __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */
+        __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
+        unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 };
 struct cifs_cred {
@@ -179,12 +197,14 @@ struct TCP_Server_Info {
        int capabilities; /* allow selective disabling of caps by smb sess */
        int timeAdj;  /* Adjust for difference in server time zone in sec */
        __u16 CurrentMid;         /* multiplex id - rotating counter */
+        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
        __u32 sequence_number; /* needed for CIFS PDU signature */
        struct session_key session_key;
        unsigned long lstrp; /* when we got last response from this server */
        u16 dialect; /* dialect index that server chose */
+        struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
        /* extended security flavors that server supports */
        bool    sec_kerberos;           /* supports plain Kerberos */
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
@@ -222,11 +242,8 @@ struct cifsSesInfo {
        char userName[MAX_USERNAME_SIZE + 1];
        char *domainName;
        char *password;
-        char cryptKey[CIFS_CRYPTO_KEY_SIZE];
        struct session_key auth_key;
-        char ntlmv2_hash[16];
+        struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
-        unsigned int tilen; /* length of the target info blob */
-        unsigned char *tiblob; /* target info blob in challenge response */
        bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
@@ -395,16 +412,19 @@ struct cifsFileInfo {
        struct list_head llist; /* list of byte range locks we have. */
        bool invalidHandle:1;   /* file closed via session abend */
        bool oplock_break_cancelled:1;
-        atomic_t count;         /* reference count */
+        int count;              /* refcount protected by cifs_file_list_lock */
        struct mutex fh_mutex; /* prevents reopen race after dead ses*/
        struct cifs_search_info srch_inf;
        struct work_struct oplock_break; /* work for oplock breaks */
 };
-/* Take a reference on the file private data */
+/*
+ * Take a reference on the file private data. Must be called with
+ * cifs_file_list_lock held.
+ */
 static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 {
-        atomic_inc(&cifs_file->count);
+        ++cifs_file->count;
 }
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
@@ -417,7 +437,6 @@ struct cifsInodeInfo {
        struct list_head lockList;
        /* BB add in lists for dirty pages i.e. write caching info for oplock */
        struct list_head openFileList;
-        int write_behind_rc;
        __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
        unsigned long time;     /* jiffies of last update/check of inode */
        bool clientCanCacheRead:1;      /* read oplock */
@@ -668,7 +687,7 @@ require use of the stronger protocol */
 *  GlobalMid_Lock protects:
 *      list operations on pending_mid_q and oplockQ
 *      updates to XID counters, multiplex id  and SMB sequence numbers
- *  GlobalSMBSesLock protects:
+ *  cifs_file_list_lock protects:
 *      list operations on tcp and SMB session lists and tCon lists
 *  f_owner.lock protects certain per file struct operations
 *  mapping->page_lock protects certain per page operations
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b0f4b5656d4c..de36b09763a8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -131,9 +131,20 @@
 #define CIFS_CRYPTO_KEY_SIZE (8)
 /*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE (24)
+/*
 * Size of the session key (crypto key encrypted with the password
 */
-#define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (16)
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
 /*
 * Maximum user name length
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e593c40ba7ba..edb6d90efdf2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -362,13 +362,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
                          __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-                                 const struct session_key *session_key,
+                                 struct TCP_Server_Info *server,
                                __u32 expected_sequence_number);
-extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
+extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
-                                 const char *pass);
+extern int setup_ntlm_response(struct cifsSesInfo *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
-extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
-                             const struct nls_table *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct cifsSesInfo *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
                                bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e98f1f317b15..2f2632b6df5a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                if (rsp->EncryptionKeyLength ==
                                cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-                        memcpy(ses->cryptKey, rsp->EncryptionKey,
+                        memcpy(ses->server->cryptkey, rsp->EncryptionKey,
                                CIFS_CRYPTO_KEY_SIZE);
                } else if (server->secMode & SECMODE_PW_ENCRYPT) {
                        rc = -EIO; /* need cryptkey unless plain text */
@@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
        server->timeAdj *= 60;
        if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-                memcpy(ses->cryptKey, pSMBr->u.EncryptionKey,
+                memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
                       CIFS_CRYPTO_KEY_SIZE);
        } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
                        && (pSMBr->EncryptionKeyLength == 0)) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7e73176acb58..9eb327defa1d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -175,6 +175,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
        }
        server->sequence_number = 0;
        server->session_estab = false;
+        kfree(server->session_key.response);
+        server->session_key.response = NULL;
+        server->session_key.len = 0;
        spin_lock(&GlobalMid_Lock);
        list_for_each(tmp, &server->pending_mid_q) {
@@ -1064,7 +1067,7 @@ cifs_parse_mount_options(char *options, const char *devname,
                        }
                        i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
                                                 value, strlen(value));
-                        if (i < 0) {
+                        if (i == 0) {
                                printk(KERN_WARNING "CIFS:  Could not parse"
                                       " srcaddr: %s\n",
                                       value);
@@ -1560,8 +1563,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
        server->tcpStatus = CifsExiting;
        spin_unlock(&GlobalMid_Lock);
+        cifs_crypto_shash_release(server);
        cifs_fscache_release_client_cookie(server);
+        kfree(server->session_key.response);
+        server->session_key.response = NULL;
+        server->session_key.len = 0;
        task = xchg(&server->tsk, NULL);
        if (task)
                force_sig(SIGKILL, task);
@@ -1614,10 +1622,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                goto out_err;
        }
+        rc = cifs_crypto_shash_allocate(tcp_ses);
+        if (rc) {
+                cERROR(1, "could not setup hash structures rc %d", rc);
+                goto out_err;
+        }
        tcp_ses->hostname = extract_hostname(volume_info->UNC);
        if (IS_ERR(tcp_ses->hostname)) {
                rc = PTR_ERR(tcp_ses->hostname);
-                goto out_err;
+                goto out_err_crypto_release;
        }
        tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1661,7 +1675,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        }
        if (rc < 0) {
                cERROR(1, "Error connecting to socket. Aborting operation");
-                goto out_err;
+                goto out_err_crypto_release;
        }
        /*
@@ -1675,7 +1689,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                rc = PTR_ERR(tcp_ses->tsk);
                cERROR(1, "error %d create cifsd thread", rc);
                module_put(THIS_MODULE);
-                goto out_err;
+                goto out_err_crypto_release;
        }
        /* thread spawned, put it on the list */
@@ -1687,6 +1701,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        return tcp_ses;
+out_err_crypto_release:
+        cifs_crypto_shash_release(tcp_ses);
 out_err:
        if (tcp_ses) {
                if (!IS_ERR(tcp_ses->hostname))
@@ -1801,8 +1818,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        if (ses == NULL)
                goto get_ses_fail;
-        ses->tilen = 0;
-        ses->tiblob = NULL;
        /* new SMB session uses our server ref */
        ses->server = server;
        if (server->addr.sockAddr6.sin6_family == AF_INET6)
@@ -1823,10 +1838,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                        goto get_ses_fail;
        }
        if (volume_info->domainname) {
-                int len = strlen(volume_info->domainname);
+                ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
-                ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+                if (!ses->domainName)
-                if (ses->domainName)
+                        goto get_ses_fail;
-                        strcpy(ses->domainName, volume_info->domainname);
        }
        ses->cred_uid = volume_info->cred_uid;
        ses->linux_uid = volume_info->linux_uid;
@@ -2985,13 +2999,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
                if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
                    (ses->server->secType == LANMAN))
-                        calc_lanman_hash(tcon->password, ses->cryptKey,
+                        calc_lanman_hash(tcon->password, ses->server->cryptkey,
                                         ses->server->secMode &
                                            SECMODE_PW_ENCRYPT ? true : false,
                                         bcc_ptr);
                else
 #endif /* CIFS_WEAK_PW_HASH */
-                SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr);
+                SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
                bcc_ptr += CIFS_SESS_KEY_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
@@ -3178,10 +3192,11 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
        } else {
                mutex_lock(&ses->server->srv_mutex);
                if (!server->session_estab) {
-                        memcpy(&server->session_key.data,
+                        server->session_key.response = ses->auth_key.response;
-                                &ses->auth_key.data, ses->auth_key.len);
                        server->session_key.len = ses->auth_key.len;
-                        ses->server->session_estab = true;
+                        server->sequence_number = 0x2;
+                        server->session_estab = true;
+                        ses->auth_key.response = NULL;
                }
                mutex_unlock(&server->srv_mutex);
@@ -3192,6 +3207,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
                spin_unlock(&GlobalMid_Lock);
        }
+        kfree(ses->auth_key.response);
+        ses->auth_key.response = NULL;
+        ses->auth_key.len = 0;
+        kfree(ses->ntlmssp);
+        ses->ntlmssp = NULL;
        return rc;
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8c81e7b14d53..ae82159cf7fa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -131,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
                        /* BB no need to lock inode until after invalidate
                        since namei code should already have it locked? */
                        rc = filemap_write_and_wait(inode->i_mapping);
-                        if (rc != 0)
+                        mapping_set_error(inode->i_mapping, rc);
-                                pCifsInode->write_behind_rc = rc;
                }
                cFYI(1, "invalidating remote inode since open detected it "
                         "changed");
@@ -232,6 +231,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
        if (pCifsFile == NULL)
                return pCifsFile;
+        pCifsFile->count = 1;
        pCifsFile->netfid = fileHandle;
        pCifsFile->pid = current->tgid;
        pCifsFile->uid = current_fsuid();
@@ -242,7 +242,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
        mutex_init(&pCifsFile->fh_mutex);
        mutex_init(&pCifsFile->lock_mutex);
        INIT_LIST_HEAD(&pCifsFile->llist);
-        atomic_set(&pCifsFile->count, 1);
        INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
        spin_lock(&cifs_file_list_lock);
@@ -267,7 +266,8 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 /*
 * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server.
+ * the filehandle out on the server. Must be called without holding
+ * cifs_file_list_lock.
 */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
@@ -276,7 +276,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
        struct cifsLockInfo *li, *tmp;
        spin_lock(&cifs_file_list_lock);
-        if (!atomic_dec_and_test(&cifs_file->count)) {
+        if (--cifs_file->count > 0) {
                spin_unlock(&cifs_file_list_lock);
                return;
        }
@@ -605,8 +605,7 @@ reopen_success:
        if (can_flush) {
                rc = filemap_write_and_wait(inode->i_mapping);
-                if (rc != 0)
+                mapping_set_error(inode->i_mapping, rc);
-                        CIFS_I(inode)->write_behind_rc = rc;
                pCifsInode->clientCanCacheAll = false;
                pCifsInode->clientCanCacheRead = false;
@@ -1303,7 +1302,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        unsigned int bytes_to_write;
        unsigned int bytes_written;
        struct cifs_sb_info *cifs_sb;
@@ -1326,15 +1324,6 @@ static int cifs_writepages(struct address_space *mapping,
        int scanned = 0;
        int xid, long_op;
-        /*
-         * BB: Is this meaningful for a non-block-device file system?
-         * If it is, we should test it again after we do I/O
-         */
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                return 0;
-        }
        cifs_sb = CIFS_SB(mapping->host->i_sb);
        /*
@@ -1363,6 +1352,7 @@ static int cifs_writepages(struct address_space *mapping,
        if (!experimEnabled && tcon->ses->server->secMode &
                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                cifsFileInfo_put(open_file);
+                kfree(iov);
                return generic_writepages(mapping, wbc);
        }
        cifsFileInfo_put(open_file);
@@ -1488,12 +1478,7 @@ retry:
                        if (rc || bytes_written < bytes_to_write) {
                                cERROR(1, "Write2 ret %d, wrote %d",
                                          rc, bytes_written);
-                                /* BB what if continued retry is
+                                mapping_set_error(mapping, rc);
-                                   requested via mount flags? */
-                                if (rc == -ENOSPC)
-                                        set_bit(AS_ENOSPC, &mapping->flags);
-                                else
-                                        set_bit(AS_EIO, &mapping->flags);
                        } else {
                                cifs_stats_bytes_written(tcon, bytes_written);
                        }
@@ -1638,11 +1623,10 @@ int cifs_fsync(struct file *file, int datasync)
        rc = filemap_write_and_wait(inode->i_mapping);
        if (rc == 0) {
-                rc = CIFS_I(inode)->write_behind_rc;
+                struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-                CIFS_I(inode)->write_behind_rc = 0;
                tcon = tlink_tcon(smbfile->tlink);
-                if (!rc && tcon && smbfile &&
+                if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-                   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
                        rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
        }
@@ -1687,21 +1671,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
        struct inode *inode = file->f_path.dentry->d_inode;
        int rc = 0;
-        /* Rather than do the steps manually:
+        if (file->f_mode & FMODE_WRITE)
-           lock the inode for writing
+                rc = filemap_write_and_wait(inode->i_mapping);
-           loop through pages looking for write behind data (dirty pages)
-           coalesce into contiguous 16K (or smaller) chunks to write to server
-           send to server (prefer in parallel)
-           deal with writebehind errors
-           unlock inode for writing
-           filemapfdatawrite appears easier for the time being */
-        rc = filemap_fdatawrite(inode->i_mapping);
-        /* reset wb rc if we were able to write out dirty pages */
-        if (!rc) {
-                rc = CIFS_I(inode)->write_behind_rc;
-                CIFS_I(inode)->write_behind_rc = 0;
-        }
        cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
@@ -2280,7 +2251,7 @@ void cifs_oplock_break(struct work_struct *work)
                                                  oplock_break);
        struct inode *inode = cfile->dentry->d_inode;
        struct cifsInodeInfo *cinode = CIFS_I(inode);
-        int rc, waitrc = 0;
+        int rc = 0;
        if (inode && S_ISREG(inode->i_mode)) {
                if (cinode->clientCanCacheRead)
@@ -2289,13 +2260,10 @@ void cifs_oplock_break(struct work_struct *work)
                        break_lease(inode, O_WRONLY);
                rc = filemap_fdatawrite(inode->i_mapping);
                if (cinode->clientCanCacheRead == 0) {
-                        waitrc = filemap_fdatawait(inode->i_mapping);
+                        rc = filemap_fdatawait(inode->i_mapping);
+                        mapping_set_error(inode->i_mapping, rc);
                        invalidate_remote_inode(inode);
                }
-                if (!rc)
-                        rc = waitrc;
-                if (rc)
-                        cinode->write_behind_rc = rc;
                cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
        }
@@ -2314,7 +2282,7 @@ void cifs_oplock_break(struct work_struct *work)
        /*
         * We might have kicked in before is_valid_oplock_break()
         * finished grabbing reference for us.  Make sure it's done by
-         * waiting for GlobalSMSSeslock.
+         * waiting for cifs_file_list_lock.
         */
        spin_lock(&cifs_file_list_lock);
        spin_unlock(&cifs_file_list_lock);
@@ -2322,6 +2290,7 @@ void cifs_oplock_break(struct work_struct *work)
        cifs_oplock_break_put(cfile);
 }
+/* must be called while holding cifs_file_list_lock */
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
        cifs_sb_active(cfile->dentry->d_sb);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 94979309698a..39869c3c3efb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1682,8 +1682,7 @@ cifs_invalidate_mapping(struct inode *inode)
        /* write back any cached data */
        if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
                rc = filemap_write_and_wait(inode->i_mapping);
-                if (rc)
+                mapping_set_error(inode->i_mapping, rc);
-                        cifs_i->write_behind_rc = rc;
        }
        invalidate_remote_inode(inode);
        cifs_fscache_reset_inode_cookie(inode);
@@ -1943,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
         * the flush returns error?
         */
        rc = filemap_write_and_wait(inode->i_mapping);
-        if (rc != 0) {
+        mapping_set_error(inode->i_mapping, rc);
-                cifsInode->write_behind_rc = rc;
+        rc = 0;
-                rc = 0;
-        }
        if (attrs->ia_valid & ATTR_SIZE) {
                rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2087,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
         * the flush returns error?
         */
        rc = filemap_write_and_wait(inode->i_mapping);
-        if (rc != 0) {
+        mapping_set_error(inode->i_mapping, rc);
-                cifsInode->write_behind_rc = rc;
+        rc = 0;
-                rc = 0;
-        }
        if (attrs->ia_valid & ATTR_SIZE) {
                rc = cifs_set_file_size(inode, attrs, xid, full_path);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1c681f6a6803..c4e296fe3518 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -577,7 +577,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                                 * cifs_oplock_break_put() can't be called
                                 * from here.  Get reference after queueing
                                 * succeeded.  cifs_oplock_break() will
-                                 * synchronize using GlobalSMSSeslock.
+                                 * synchronize using cifs_file_list_lock.
                                 */
                                if (queue_work(system_nrt_wq,
                                               &netfile->oplock_break))
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2a11efd96592..7b01d3f6eed6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
 #include <linux/slab.h>
 #include "cifs_spnego.h"
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-                         unsigned char *p24);
 /*
 * Checks if this is the first smb session to be reconnected after
 * the socket has been reestablished (so we know whether to use vc 0).
@@ -402,23 +399,22 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
                return -EINVAL;
        }
-        memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+        memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
        /* BB we could decode pblob->NegotiateFlags; some may be useful */
        /* In particular we can examine sign flags */
        /* BB spec says that if AvId field of MsvAvTimestamp is populated then
                we must set the MIC field of the AUTHENTICATE_MESSAGE */
+        ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
        tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
        tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
-        ses->tilen = tilen;
+        if (tilen) {
-        if (ses->tilen) {
+                ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
-                ses->tiblob = kmalloc(tilen, GFP_KERNEL);
+                if (!ses->auth_key.response) {
-                if (!ses->tiblob) {
                        cERROR(1, "Challenge target info allocation failure");
-                        ses->tilen = 0;
                        return -ENOMEM;
                }
-                memcpy(ses->tiblob,  bcc_ptr + tioffset, ses->tilen);
+                memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
+                ses->auth_key.len = tilen;
        }
        return 0;
@@ -443,10 +439,12 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
                NTLMSSP_NEGOTIATE_NTLM;
        if (ses->server->secMode &
-           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                flags |= NTLMSSP_NEGOTIATE_SIGN;
-        if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+                if (!ses->server->session_estab)
-                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+                        flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
+                                NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+        }
        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
@@ -469,11 +467,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                                   const struct nls_table *nls_cp)
 {
        int rc;
-        unsigned int size;
        AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
        __u32 flags;
        unsigned char *tmp;
-        struct ntlmv2_resp ntlmv2_response = {};
        memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
        sec_blob->MessageType = NtLmAuthenticate;
@@ -497,25 +493,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->LmChallengeResponse.MaximumLength = 0;
        sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-        rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
+        rc = setup_ntlmv2_rsp(ses, nls_cp);
        if (rc) {
                cERROR(1, "Error %d during NTLMSSP authentication", rc);
                goto setup_ntlmv2_ret;
        }
-        size =  sizeof(struct ntlmv2_resp);
+        memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
-        memcpy(tmp, (char *)&ntlmv2_response, size);
+                        ses->auth_key.len - CIFS_SESS_KEY_SIZE);
-        tmp += size;
+        tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-        if (ses->tilen > 0) {
-                memcpy(tmp, ses->tiblob, ses->tilen);
-                tmp += ses->tilen;
-        }
-        sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen);
+        sec_blob->NtChallengeResponse.Length =
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
        sec_blob->NtChallengeResponse.MaximumLength =
-                                cpu_to_le16(size + ses->tilen);
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
-        kfree(ses->tiblob);
-        ses->tiblob = NULL;
-        ses->tilen = 0;
        if (ses->domainName == NULL) {
                sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -554,9 +544,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->WorkstationName.MaximumLength = 0;
        tmp += 2;
-        sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+        if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
-        sec_blob->SessionKey.Length = 0;
+                        !calc_seckey(ses)) {
-        sec_blob->SessionKey.MaximumLength = 0;
+                memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+                sec_blob->SessionKey.MaximumLength =
+                                cpu_to_le16(CIFS_CPHTXT_SIZE);
+                tmp += CIFS_CPHTXT_SIZE;
+        } else {
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = 0;
+                sec_blob->SessionKey.MaximumLength = 0;
+        }
 setup_ntlmv2_ret:
        *buflen = tmp - pbuffer;
@@ -600,8 +600,16 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
                return -EINVAL;
        type = ses->server->secType;
        cFYI(1, "sess setup type %d", type);
+        if (type == RawNTLMSSP) {
+                /* if memory allocation is successful, caller of this function
+                 * frees it.
+                 */
+                ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+                if (!ses->ntlmssp)
+                        return -ENOMEM;
+        }
 ssetup_ntlmssp_authenticate:
        if (phase == NtLmChallenge)
                phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -666,10 +674,14 @@ ssetup_ntlmssp_authenticate:
                /* no capabilities flags in old lanman negotiation */
                pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-                /* BB calculate hash with password */
-                /* and copy into bcc */
-                calc_lanman_hash(ses->password, ses->cryptKey,
+                /* Calculate hash with password and copy into bcc_ptr.
+                 * Encryption Key (stored in cryptkey) gets used if the
+                 * security mode bit in Negotiate Protocol response states
+                 * to use challenge/response method (i.e. Password bit is 1).
+                 */
+                calc_lanman_hash(ses->password, ses->server->cryptkey,
                                 ses->server->secMode & SECMODE_PW_ENCRYPT ?
                                        true : false, lnm_session_key);
@@ -687,24 +699,27 @@ ssetup_ntlmssp_authenticate:
                ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
        } else if (type == NTLM) {
-                char ntlm_session_key[CIFS_SESS_KEY_SIZE];
                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
                pSMB->req_no_secext.CaseInsensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                        cpu_to_le16(CIFS_AUTH_RESP_SIZE);
                pSMB->req_no_secext.CaseSensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                        cpu_to_le16(CIFS_AUTH_RESP_SIZE);
+                /* calculate ntlm response and session key */
+                rc = setup_ntlm_response(ses);
+                if (rc) {
+                        cERROR(1, "Error %d during NTLM authentication", rc);
+                        goto ssetup_exit;
+                }
-                /* calculate session key */
+                /* copy ntlm response */
-                SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key);
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                                CIFS_AUTH_RESP_SIZE);
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                                CIFS_AUTH_RESP_SIZE);
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
-                cifs_calculate_session_key(&ses->auth_key,
-                                        ntlm_session_key, ses->password);
-                /* copy session key */
-                memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
-                memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
                        /* unicode strings must be word aligned */
                        if (iov[0].iov_len % 2) {
@@ -715,47 +730,26 @@ ssetup_ntlmssp_authenticate:
                } else
                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
        } else if (type == NTLMv2) {
-                char *v2_sess_key =
-                        kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
-                /* BB FIXME change all users of v2_sess_key to
-                   struct ntlmv2_resp */
-                if (v2_sess_key == NULL) {
-                        rc = -ENOMEM;
-                        goto ssetup_exit;
-                }
                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
                /* LM2 password would be here if we supported it */
                pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-                /*      cpu_to_le16(LM2_SESS_KEY_SIZE); */
-                /* calculate session key */
+                /* calculate ntlmv2 response and session key */
-                rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+                rc = setup_ntlmv2_rsp(ses, nls_cp);
                if (rc) {
                        cERROR(1, "Error %d during NTLMv2 authentication", rc);
-                        kfree(v2_sess_key);
                        goto ssetup_exit;
                }
-                memcpy(bcc_ptr, (char *)v2_sess_key,
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
-                                sizeof(struct ntlmv2_resp));
+                                ses->auth_key.len - CIFS_SESS_KEY_SIZE);
-                bcc_ptr += sizeof(struct ntlmv2_resp);
+                bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-                kfree(v2_sess_key);
                /* set case sensitive password length after tilen may get
                 * assigned, tilen is 0 otherwise.
                 */
                pSMB->req_no_secext.CaseSensitivePasswordLength =
-                        cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen);
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
-                if (ses->tilen > 0) {
-                        memcpy(bcc_ptr, ses->tiblob, ses->tilen);
-                        bcc_ptr += ses->tilen;
-                        /* we never did allocate ses->domainName to free */
-                        kfree(ses->tiblob);
-                        ses->tiblob = NULL;
-                        ses->tilen = 0;
-                }
                if (ses->capabilities & CAP_UNICODE) {
                        if (iov[0].iov_len % 2) {
@@ -768,6 +762,7 @@ ssetup_ntlmssp_authenticate:
        } else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
                struct cifs_spnego_msg *msg;
                spnego_key = cifs_get_spnego_key(ses);
                if (IS_ERR(spnego_key)) {
                        rc = PTR_ERR(spnego_key);
@@ -785,16 +780,17 @@ ssetup_ntlmssp_authenticate:
                        rc = -EKEYREJECTED;
                        goto ssetup_exit;
                }
-                /* bail out if key is too long */
-                if (msg->sesskey_len >
+                ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
-                    sizeof(ses->auth_key.data.krb5)) {
+                if (!ses->auth_key.response) {
-                        cERROR(1, "Kerberos signing key too long (%u bytes)",
+                        cERROR(1, "Kerberos can't allocate (%u bytes) memory",
-                                msg->sesskey_len);
+                                        msg->sesskey_len);
-                        rc = -EOVERFLOW;
+                        rc = -ENOMEM;
                        goto ssetup_exit;
                }
+                memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
                ses->auth_key.len = msg->sesskey_len;
-                memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len);
                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
                capabilities |= CAP_EXTENDED_SECURITY;
                pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -897,8 +893,6 @@ ssetup_ntlmssp_authenticate:
                          CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
        /* SMB request buf freed in SendReceive2 */
-        cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
        pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
        smb_buf = (struct smb_hdr *)iov[0].iov_base;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a66c91eb6eb4..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(midQ->resp_buf,
-                                                &ses->server->session_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(out_buf,
-                                                &ses->server->session_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
            (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                     SECMODE_SIGN_ENABLED))) {
                rc = cifs_verify_signature(out_buf,
-                                           &ses->server->session_key,
+                                           ses->server,
                                           midQ->sequence_number+1);
                if (rc) {
                        cERROR(1, "Unexpected SMB signature");
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..9060f08e70cf 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
 {
        struct coda_inode_info *cii = ITOC(inode);
+        spin_lock(&cii->c_lock);
        cii->c_cached_epoch = atomic_read(&permission_epoch);
        if (cii->c_uid != current_fsuid()) {
                cii->c_uid = current_fsuid();
                cii->c_cached_perm = mask;
        } else
                cii->c_cached_perm |= mask;
+        spin_unlock(&cii->c_lock);
 }
 /* remove cached acl from an inode */
 void coda_cache_clear_inode(struct inode *inode)
 {
        struct coda_inode_info *cii = ITOC(inode);
+        spin_lock(&cii->c_lock);
        cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
+        spin_unlock(&cii->c_lock);
 }
 /* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
 int coda_cache_check(struct inode *inode, int mask)
 {
        struct coda_inode_info *cii = ITOC(inode);
-        int hit;
+        int hit;
        
-        hit = (mask & cii->c_cached_perm) == mask &&
+        spin_lock(&cii->c_lock);
-                cii->c_uid == current_fsuid() &&
+        hit = (mask & cii->c_cached_perm) == mask &&
-                cii->c_cached_epoch == atomic_read(&permission_epoch);
+            cii->c_uid == current_fsuid() &&
+            cii->c_cached_epoch == atomic_read(&permission_epoch);
+        spin_unlock(&cii->c_lock);
-        return hit;
+        return hit;
 }
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..602240569c89 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
 static int coda_test_inode(struct inode *inode, void *data)
 {
        struct CodaFid *fid = (struct CodaFid *)data;
-        return coda_fideq(&(ITOC(inode)->c_fid), fid);
+        struct coda_inode_info *cii = ITOC(inode);
+        return coda_fideq(&cii->c_fid, fid);
 }
 static int coda_set_inode(struct inode *inode, void *data)
 {
        struct CodaFid *fid = (struct CodaFid *)data;
-        ITOC(inode)->c_fid = *fid;
+        struct coda_inode_info *cii = ITOC(inode);
+        cii->c_fid = *fid;
        return 0;
 }
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
                cii = ITOC(inode);
                /* we still need to set i_ino for things like stat(2) */
                inode->i_ino = hash;
+                /* inode is locked and unique, no need to grab cii->c_lock */
                cii->c_mapcount = 0;
                unlock_new_inode(inode);
        }
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
 }
+/* Although we treat Coda file identifiers as immutable, there is one
+ * special case for files created during a disconnection where they may
+ * not be globally unique. When an identifier collision is detected we
+ * first try to flush the cached inode from the kernel and finally
+ * resort to renaming/rehashing in-place. Userspace remembers both old
+ * and new values of the identifier to handle any in-flight upcalls.
+ * The real solution is to use globally unique UUIDs as identifiers, but
+ * retrofitting the existing userspace code for this is non-trivial. */
 void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 
                      struct CodaFid *newfid)
 {
-        struct coda_inode_info *cii;
+        struct coda_inode_info *cii = ITOC(inode);
        unsigned long hash = coda_f2i(newfid);
        
-        cii = ITOC(inode);
        BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
        /* replace fid and rehash inode */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <asm/uaccess.h>
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
                goto exit;
        }
-        lock_kernel();
        error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
                             &type, &resfid);
        if (!error)
                error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-        unlock_kernel();
        if (error && error != -ENOENT)
                return ERR_PTR(error);
@@ -140,28 +136,24 @@ exit:
 int coda_permission(struct inode *inode, int mask)
 {
-        int error = 0;
+        int error;
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
        if (!mask)
-                return 0; 
+                return 0;
        if ((mask & MAY_EXEC) && !execute_ok(inode))
                return -EACCES;
-        lock_kernel();
        if (coda_cache_check(inode, mask))
-                goto out; 
+                return 0;
-        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
+        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
    
        if (!error)
                coda_cache_enter(inode, mask);
- out:
-        unlock_kernel();
        return error;
 }
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 /* creation routines: create, mknod, mkdir, link, symlink */
 static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
 {
-        int error=0;
+        int error;
        const char *name=de->d_name.name;
        int length=de->d_name.len;
        struct inode *inode;
        struct CodaFid newfid;
        struct coda_vattr attrs;
-        lock_kernel();
+        if (coda_isroot(dir) && coda_iscontrol(name, length))
-        if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-                unlock_kernel();
                return -EPERM;
-        }
        error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 
                                0, mode, &newfid, &attrs);
+        if (error)
-        if ( error ) {
+                goto err_out;
-                unlock_kernel();
-                d_drop(de);
-                return error;
-        }
        inode = coda_iget(dir->i_sb, &newfid, &attrs);
-        if ( IS_ERR(inode) ) {
+        if (IS_ERR(inode)) {
-                unlock_kernel();
+                error = PTR_ERR(inode);
-                d_drop(de);
+                goto err_out;
-                return PTR_ERR(inode);
        }
        /* invalidate the directory cnode's attributes */
        coda_dir_update_mtime(dir);
-        unlock_kernel();
        d_instantiate(de, inode);
        return 0;
+err_out:
+        d_drop(de);
+        return error;
 }
 static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
        int error;
        struct CodaFid newfid;
-        lock_kernel();
+        if (coda_isroot(dir) && coda_iscontrol(name, len))
-        if (coda_isroot(dir) && coda_iscontrol(name, len)) {
-                unlock_kernel();
                return -EPERM;
-        }
        attrs.va_mode = mode;
        error = venus_mkdir(dir->i_sb, coda_i2f(dir), 
                               name, len, &newfid, &attrs);
-        
+        if (error)
-        if ( error ) {
+                goto err_out;
-                unlock_kernel();
-                d_drop(de);
-                return error;
-        }
         
        inode = coda_iget(dir->i_sb, &newfid, &attrs);
-        if ( IS_ERR(inode) ) {
+        if (IS_ERR(inode)) {
-                unlock_kernel();
+                error = PTR_ERR(inode);
-                d_drop(de);
+                goto err_out;
-                return PTR_ERR(inode);
        }
        /* invalidate the directory cnode's attributes */
        coda_dir_inc_nlink(dir);
        coda_dir_update_mtime(dir);
-        unlock_kernel();
        d_instantiate(de, inode);
        return 0;
+err_out:
+        d_drop(de);
+        return error;
 }
 /* try to make de an entry in dir_inodde linked to source_de */ 
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
        int len = de->d_name.len;
        int error;
-        lock_kernel();
+        if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
-        if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-                unlock_kernel();
                return -EPERM;
-        }
        error = venus_link(dir_inode->i_sb, coda_i2f(inode),
                           coda_i2f(dir_inode), (const char *)name, len);
        if (error) {
                d_drop(de);
-                goto out;
+                return error;
        }
        coda_dir_update_mtime(dir_inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(de, inode);
        inc_nlink(inode);
+        return 0;
-out:
-        unlock_kernel();
-        return(error);
 }
 static int coda_symlink(struct inode *dir_inode, struct dentry *de,
                        const char *symname)
 {
-        const char *name = de->d_name.name;
+        const char *name = de->d_name.name;
        int len = de->d_name.len;
        int symlen;
-        int error = 0;
+        int error;
-        lock_kernel();
-        if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
+        if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
-                unlock_kernel();
                return -EPERM;
-        }
        symlen = strlen(symname);
-        if ( symlen > CODA_MAXPATHLEN ) {
+        if (symlen > CODA_MAXPATHLEN)
-                unlock_kernel();
                return -ENAMETOOLONG;
-        }
        /*
         * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
                              symname, symlen);
        /* mtime is no good anymore */
-        if ( !error )
+        if (!error)
                coda_dir_update_mtime(dir_inode);
-        unlock_kernel();
        return error;
 }
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
        const char *name = de->d_name.name;
        int len = de->d_name.len;
-        lock_kernel();
        error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
-        if ( error ) {
+        if (error)
-                unlock_kernel();
                return error;
-        }
        coda_dir_update_mtime(dir);
        drop_nlink(de->d_inode);
-        unlock_kernel();
        return 0;
 }
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
        int len = de->d_name.len;
        int error;
-        lock_kernel();
        error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
        if (!error) {
                /* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
                coda_dir_drop_nlink(dir);
                coda_dir_update_mtime(dir);
        }
-        unlock_kernel();
        return error;
 }
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
        int new_length = new_dentry->d_name.len;
        int error;
-        lock_kernel();
        error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
                             coda_i2f(new_dir), old_length, new_length,
                             (const char *) old_name, (const char *)new_name);
+        if (!error) {
-        if ( !error ) {
+                if (new_dentry->d_inode) {
-                if ( new_dentry->d_inode ) {
+                        if (S_ISDIR(new_dentry->d_inode->i_mode)) {
-                        if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
                                coda_dir_drop_nlink(old_dir);
                                coda_dir_inc_nlink(new_dir);
                        }
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
                        coda_flag_inode(new_dir, C_VATTR);
                }
        }
-        unlock_kernel();
        return error;
 }
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
        struct inode *inode = de->d_inode;
        struct coda_inode_info *cii;
-        if (!inode)
+        if (!inode || coda_isroot(inode))
-                return 1;
-        lock_kernel();
-        if (coda_isroot(inode))
                goto out;
        if (is_bad_inode(inode))
                goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
                goto out;
        /* clear the flags. */
+        spin_lock(&cii->c_lock);
        cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+        spin_unlock(&cii->c_lock);
 bad:
-        unlock_kernel();
        return 0;
 out:
-        unlock_kernel();
        return 1;
 }
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
 int coda_revalidate_inode(struct dentry *dentry)
 {
        struct coda_vattr attr;
-        int error = 0;
+        int error;
        int old_mode;
        ino_t old_ino;
        struct inode *inode = dentry->d_inode;
        struct coda_inode_info *cii = ITOC(inode);
-        lock_kernel();
+        if (!cii->c_flags)
-        if ( !cii->c_flags )
+                return 0;
-                goto ok;
        if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
                error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
-                if ( error )
+                if (error)
-                        goto return_bad;
+                        return -EIO;
                /* this inode may be lost if:
                   - it's ino changed 
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
                /* the following can happen when a local fid is replaced 
                   with a global one, here we lose and declare the inode bad */
                if (inode->i_ino != old_ino)
-                        goto return_bad;
+                        return -EIO;
                
                coda_flag_inode_children(inode, C_FLUSH);
+                spin_lock(&cii->c_lock);
                cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+                spin_unlock(&cii->c_lock);
        }
-ok:
-        unlock_kernel();
        return 0;
-return_bad:
-        unlock_kernel();
-        return -EIO;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..c8b50ba4366a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
 #include <linux/stat.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
        coda_inode = coda_file->f_path.dentry->d_inode;
        host_inode = host_file->f_path.dentry->d_inode;
+        cii = ITOC(coda_inode);
+        spin_lock(&cii->c_lock);
        coda_file->f_mapping = host_file->f_mapping;
        if (coda_inode->i_mapping == &coda_inode->i_data)
                coda_inode->i_mapping = host_inode->i_mapping;
        /* only allow additional mmaps as long as userspace isn't changing
         * the container file on us! */
-        else if (coda_inode->i_mapping != host_inode->i_mapping)
+        else if (coda_inode->i_mapping != host_inode->i_mapping) {
+                spin_unlock(&cii->c_lock);
                return -EBUSY;
+        }
        /* keep track of how often the coda_inode/host_file has been mmapped */
-        cii = ITOC(coda_inode);
        cii->c_mapcount++;
        cfi->cfi_mapcount++;
+        spin_unlock(&cii->c_lock);
        return host_file->f_op->mmap(host_file, vma);
 }
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        if (!cfi)
                return -ENOMEM;
-        lock_kernel();
        error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
                           &host_file);
        if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        if (error) {
                kfree(cfi);
-                unlock_kernel();
                return error;
        }
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        BUG_ON(coda_file->private_data != NULL);
        coda_file->private_data = cfi;
-        unlock_kernel();
        return 0;
 }
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
        struct coda_file_info *cfi;
        struct coda_inode_info *cii;
        struct inode *host_inode;
-        int err = 0;
+        int err;
-        lock_kernel();
        cfi = CODA_FTOC(coda_file);
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
        cii = ITOC(coda_inode);
        /* did we mmap this file? */
+        spin_lock(&cii->c_lock);
        if (coda_inode->i_mapping == &host_inode->i_data) {
                cii->c_mapcount -= cfi->cfi_mapcount;
                if (!cii->c_mapcount)
                        coda_inode->i_mapping = &coda_inode->i_data;
        }
+        spin_unlock(&cii->c_lock);
        fput(cfi->cfi_container);
        kfree(coda_file->private_data);
        coda_file->private_data = NULL;
-        unlock_kernel();
        /* VFS fput ignores the return value from file_operations->release, so
         * there is no use returning an error here */
        return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
        struct file *host_file;
        struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
        struct coda_file_info *cfi;
-        int err = 0;
+        int err;
        if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
              S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
        host_file = cfi->cfi_container;
        err = vfs_fsync(host_file, datasync);
-        if ( !err && !datasync ) {
+        if (!err && !datasync)
-                lock_kernel();
                err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-                unlock_kernel();
-        }
        return err;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index bfe8179b1295..5ea57c8c7f97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
        ei->c_flags = 0;
        ei->c_uid = 0;
        ei->c_cached_perm = 0;
+        spin_lock_init(&ei->c_lock);
        return &ei->vfs_inode;
 }
@@ -143,13 +145,11 @@ static int get_device_index(struct coda_mount_data *data)
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct inode *root = NULL;
-        struct venus_comm *vc = NULL;
+        struct venus_comm *vc;
        struct CodaFid fid;
        int error;
        int idx;
-        lock_kernel();
        idx = get_device_index((struct coda_mount_data *) data);
        /* Ignore errors in data, for backward compatibility */
@@ -159,23 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
        vc = &coda_comms[idx];
+        mutex_lock(&vc->vc_mutex);
        if (!vc->vc_inuse) {
                printk("coda_read_super: No pseudo device\n");
-                unlock_kernel();
+                error = -EINVAL;
-                return -EINVAL;
+                goto unlock_out;
        }
-        if ( vc->vc_sb ) {
+        if (vc->vc_sb) {
                printk("coda_read_super: Device already mounted\n");
-                unlock_kernel();
+                error = -EBUSY;
-                return -EBUSY;
+                goto unlock_out;
        }
        error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
        if (error)
-                goto bdi_err;
+                goto unlock_out;
        vc->vc_sb = sb;
+        mutex_unlock(&vc->vc_mutex);
        sb->s_fs_info = vc;
        sb->s_flags |= MS_NOATIME;
@@ -204,28 +207,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        printk("coda_read_super: rootinode is %ld dev %s\n", 
               root->i_ino, root->i_sb->s_id);
        sb->s_root = d_alloc_root(root);
-        if (!sb->s_root)
+        if (!sb->s_root) {
+                error = -EINVAL;
                goto error;
-        unlock_kernel();
+        }
        return 0;
- error:
+error:
-        bdi_destroy(&vc->bdi);
- bdi_err:
        if (root)
                iput(root);
-        if (vc)
-                vc->vc_sb = NULL;
-        unlock_kernel();
+        mutex_lock(&vc->vc_mutex);
-        return -EINVAL;
+        bdi_destroy(&vc->bdi);
+        vc->vc_sb = NULL;
+        sb->s_fs_info = NULL;
+unlock_out:
+        mutex_unlock(&vc->vc_mutex);
+        return error;
 }
 static void coda_put_super(struct super_block *sb)
 {
-        bdi_destroy(&coda_vcp(sb)->bdi);
+        struct venus_comm *vcp = coda_vcp(sb);
-        coda_vcp(sb)->vc_sb = NULL;
+        mutex_lock(&vcp->vc_mutex);
+        bdi_destroy(&vcp->bdi);
+        vcp->vc_sb = NULL;
        sb->s_fs_info = NULL;
+        mutex_unlock(&vcp->vc_mutex);
        printk("Coda: Bye bye.\n");
 }
@@ -251,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
        struct coda_vattr vattr;
        int error;
-        lock_kernel();
-        
        memset(&vattr, 0, sizeof(vattr)); 
        inode->i_ctime = CURRENT_TIME_SEC;
@@ -262,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
        /* Venus is responsible for truncating the container-file!!! */
        error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
-        if ( !error ) {
+        if (!error) {
                coda_vattr_to_iattr(inode, &vattr); 
                coda_cache_clear_inode(inode);
        }
-        unlock_kernel();
        return error;
 }
@@ -282,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        int error;
        
-        lock_kernel();
        error = venus_statfs(dentry, buf);
-        unlock_kernel();
        if (error) {
                /* fake something like AFS does */
                buf->f_blocks = 9000000;
@@ -307,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 /* init_coda: used by filesystems.c to register coda */
-static int coda_get_sb(struct file_system_type *fs_type,
+static struct dentry *coda_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, coda_fill_super);
 }
 struct file_system_type coda_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "coda",
-        .get_sb         = coda_get_sb,
+        .mount          = coda_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 028a9a0f588b..2fd89b5c5c7b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
-#include <linux/smp_lock.h>
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -58,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
        struct inode *target_inode = NULL;
        struct coda_inode_info *cnp;
-        lock_kernel();
        /* get the Pioctl data arguments from user space */
-        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+        if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
-                error = -EINVAL;
+                return -EINVAL;
-                goto out;
-        }
        /*
         * Look up the pathname. Note that the pathname is in
@@ -76,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
                error = user_lpath(data.path, &path);
        if (error)
-                goto out;
+                return error;
-        else
-                target_inode = path.dentry->d_inode;
+        target_inode = path.dentry->d_inode;
        /* return if it is not a Coda inode */
        if (target_inode->i_sb != inode->i_sb) {
-                path_put(&path);
                error = -EINVAL;
                goto out;
        }
@@ -91,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
        cnp = ITOC(target_inode);
        error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
-        path_put(&path);
 out:
-        unlock_kernel();
+        path_put(&path);
        return error;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index fdc2f3ef7ecd..62647a8595e4 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/device.h>
 #include <asm/io.h>
 #include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
        unsigned int mask = POLLOUT | POLLWRNORM;
        poll_wait(file, &vcp->vc_waitq, wait);
+        mutex_lock(&vcp->vc_mutex);
        if (!list_empty(&vcp->vc_pending))
                mask |= POLLIN | POLLRDNORM;
+        mutex_unlock(&vcp->vc_mutex);
        return mask;
 }
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                return -EFAULT;
        if (DOWNCALL(hdr.opcode)) {
-                struct super_block *sb = NULL;
+                union outputArgs *dcbuf;
-                union outputArgs *dcbuf;
                int size = sizeof(*dcbuf);
-                sb = vcp->vc_sb;
-                if ( !sb ) {
-                        count = nbytes;
-                        goto out;
-                }
                if  ( nbytes < sizeof(struct coda_out_hdr) ) {
                        printk("coda_downcall opc %d uniq %d, not enough!\n",
                               hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                }
                /* what downcall errors does Venus handle ? */
-                lock_kernel();
+                error = coda_downcall(vcp, hdr.opcode, dcbuf);
-                error = coda_downcall(hdr.opcode, dcbuf, sb);
-                unlock_kernel();
                CODA_FREE(dcbuf, nbytes);
                if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
        }
        
        /* Look for the message on the processing queue. */
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        list_for_each(lh, &vcp->vc_processing) {
                tmp = list_entry(lh, struct upc_req , uc_chain);
                if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                        break;
                }
        }
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        if (!req) {
                printk("psdev_write: msg (%d, %d) not found\n", 
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
        if (nbytes == 0)
                return 0;
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        add_wait_queue(&vcp->vc_waitq, &wait);
        set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
                        retval = -ERESTARTSYS;
                        break;
                }
+                mutex_unlock(&vcp->vc_mutex);
                schedule();
+                mutex_lock(&vcp->vc_mutex);
        }
        set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
        CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
        kfree(req);
 out:
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return (count ? count : retval);
 }
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
        if (idx < 0 || idx >= MAX_CODADEVS)
                return -ENODEV;
-        lock_kernel();
        err = -EBUSY;
        vcp = &coda_comms[idx];
+        mutex_lock(&vcp->vc_mutex);
        if (!vcp->vc_inuse) {
                vcp->vc_inuse++;
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
                err = 0;
        }
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return err;
 }
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
                return -1;
        }
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        /* Wakeup clients so they can return. */
        list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
        file->private_data = NULL;
        vcp->vc_inuse--;
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return 0;
 }
@@ -362,9 +357,11 @@ static int init_coda_psdev(void)
                err = PTR_ERR(coda_psdev_class);
                goto out_chrdev;
        }               
-        for (i = 0; i < MAX_CODADEVS; i++)
+        for (i = 0; i < MAX_CODADEVS; i++) {
+                mutex_init(&(&coda_comms[i])->vc_mutex);
                device_create(coda_psdev_class, NULL,
                              MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+        }
        coda_sysctl_init();
        goto out;
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..af78f007a2b0 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
        unsigned int len = PAGE_SIZE;
        char *p = kmap(page);
-        lock_kernel();
        cii = ITOC(inode);
        error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
-        unlock_kernel();
        if (error)
                goto fail;
        SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..c3563cab9758 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -606,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old)
                                 (r)->uc_opcode != CODA_RELEASE) || \
                                (r)->uc_flags & CODA_REQ_READ))
-static inline void coda_waitfor_upcall(struct upc_req *req)
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+                                       struct upc_req *req)
 {
        DECLARE_WAITQUEUE(wait, current);
        unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
                        break;
                }
+                mutex_unlock(&vcp->vc_mutex);
                if (blocked)
                        schedule_timeout(HZ);
                else
                        schedule();
+                mutex_lock(&vcp->vc_mutex);
        }
        if (blocked)
                coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
 {
        union outputArgs *out;
        union inputArgs *sig_inputArgs;
-        struct upc_req *req, *sig_req;
+        struct upc_req *req = NULL, *sig_req;
-        int error = 0;
+        int error;
+        mutex_lock(&vcp->vc_mutex);
        if (!vcp->vc_inuse) {
                printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
-                return -ENXIO;
+                error = -ENXIO;
+                goto exit;
        }
        /* Format the request message. */
        req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
-        if (!req)
+        if (!req) {
-                return -ENOMEM;
+                error = -ENOMEM;
+                goto exit;
+        }
        req->uc_data = (void *)buffer;
        req->uc_flags = 0;
@@ -705,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp,
         * ENODEV.  */
        /* Go to sleep.  Wake up on signals only after the timeout. */
-        coda_waitfor_upcall(req);
+        coda_waitfor_upcall(vcp, req);
        /* Op went through, interrupt or not... */
        if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
 exit:
        kfree(req);
+        mutex_unlock(&vcp->vc_mutex);
        return error;
 }
@@ -796,21 +806,24 @@ exit:
 *
 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
-int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
        struct inode *inode = NULL;
-        struct CodaFid *fid, *newfid;
+        struct CodaFid *fid = NULL, *newfid;
+        struct super_block *sb;
        /* Handle invalidation requests. */
-        if ( !sb || !sb->s_root)
+        mutex_lock(&vcp->vc_mutex);
-                return 0;
+        sb = vcp->vc_sb;
+        if (!sb || !sb->s_root)
+                goto unlock_out;
        switch (opcode) {
        case CODA_FLUSH:
                coda_cache_clear_all(sb);
                shrink_dcache_sb(sb);
                if (sb->s_root->d_inode)
-                    coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
+                        coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
                break;
        case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
        case CODA_ZAPDIR:
                fid = &out->coda_zapdir.CodaFid;
-                inode = coda_fid_to_inode(fid, sb);
-                if (inode) {
-                        coda_flag_inode_children(inode, C_PURGE);
-                        coda_flag_inode(inode, C_VATTR);
-                }
                break;
        case CODA_ZAPFILE:
                fid = &out->coda_zapfile.CodaFid;
-                inode = coda_fid_to_inode(fid, sb);
-                if (inode)
-                        coda_flag_inode(inode, C_VATTR);
                break;
        case CODA_PURGEFID:
                fid = &out->coda_purgefid.CodaFid;
+                break;
+        case CODA_REPLACE:
+                fid = &out->coda_replace.OldFid;
+                break;
+        }
+        if (fid)
                inode = coda_fid_to_inode(fid, sb);
-                if (inode) {
-                        coda_flag_inode_children(inode, C_PURGE);
-                        /* catch the dentries later if some are still busy */
+unlock_out:
-                        coda_flag_inode(inode, C_PURGE);
+        mutex_unlock(&vcp->vc_mutex);
-                        d_prune_aliases(inode);
-                }
+        if (!inode)
+                return 0;
+        switch (opcode) {
+        case CODA_ZAPDIR:
+                coda_flag_inode_children(inode, C_PURGE);
+                coda_flag_inode(inode, C_VATTR);
+                break;
+        case CODA_ZAPFILE:
+                coda_flag_inode(inode, C_VATTR);
+                break;
+        case CODA_PURGEFID:
+                coda_flag_inode_children(inode, C_PURGE);
+                /* catch the dentries later if some are still busy */
+                coda_flag_inode(inode, C_PURGE);
+                d_prune_aliases(inode);
                break;
        case CODA_REPLACE:
-                fid = &out->coda_replace.OldFid;
                newfid = &out->coda_replace.NewFid;
-                inode = coda_fid_to_inode(fid, sb);
+                coda_replace_fid(inode, fid, newfid);
-                if (inode)
-                        coda_replace_fid(inode, fid, newfid);
                break;
        }
+        iput(inode);
-        if (inode)
-                iput(inode);
        return 0;
 }
diff --git a/fs/compat.c b/fs/compat.c
index 0644a154672b..ff66c0d7583d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/smb.h>
-#include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
@@ -608,14 +606,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
        /*
         * Single unix specification:
         * We should -EINVAL if an element length is not >= 0 and fitting an
-         * ssize_t.  The total length is fitting an ssize_t
+         * ssize_t.
         *
-         * Be careful here because iov_len is a size_t not an ssize_t
+         * In Linux, the total length is limited to MAX_RW_COUNT, there is
+         * no overflow possibility.
         */
        tot_len = 0;
        ret = -EINVAL;
        for (seg = 0; seg < nr_segs; seg++) {
-                compat_ssize_t tmp = tot_len;
                compat_uptr_t buf;
                compat_ssize_t len;
@@ -626,13 +624,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
                }
                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
                        goto out;
-                tot_len += len;
-                if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-                        goto out;
                if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
                        ret = -EFAULT;
                        goto out;
                }
+                if (len > MAX_RW_COUNT - tot_len)
+                        len = MAX_RW_COUNT - tot_len;
+                tot_len += len;
                iov->iov_base = compat_ptr(buf);
                iov->iov_len = (compat_size_t) len;
                uvector++;
@@ -745,30 +743,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
        return raw_data;
 }
-struct compat_smb_mount_data {
-        compat_int_t version;
-        __compat_uid_t mounted_uid;
-        __compat_uid_t uid;
-        __compat_gid_t gid;
-        compat_mode_t file_mode;
-        compat_mode_t dir_mode;
-};
-static void *do_smb_super_data_conv(void *raw_data)
-{
-        struct smb_mount_data *s = raw_data;
-        struct compat_smb_mount_data *c_s = raw_data;
-        if (c_s->version != SMB_MOUNT_OLDVERSION)
-                goto out;
-        s->dir_mode = c_s->dir_mode;
-        s->file_mode = c_s->file_mode;
-        s->gid = c_s->gid;
-        s->uid = c_s->uid;
-        s->mounted_uid = c_s->mounted_uid;
- out:
-        return raw_data;
-}
 struct compat_nfs_string {
        compat_uint_t len;
@@ -835,7 +809,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
        return 0;
 }
-#define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME       "nfs4"
@@ -870,9 +843,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
        retval = -EINVAL;
        if (kernel_type && data_page) {
-                if (!strcmp(kernel_type, SMBFS_NAME)) {
+                if (!strcmp(kernel_type, NCPFS_NAME)) {
-                        do_smb_super_data_conv((void *)data_page);
-                } else if (!strcmp(kernel_type, NCPFS_NAME)) {
                        do_ncp_super_data_conv((void *)data_page);
                } else if (!strcmp(kernel_type, NFS4_NAME)) {
                        if (do_nfs4_super_data_conv((void *) data_page))
@@ -1963,7 +1934,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
-#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
        u16                     svc32_port;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d0ad09d57789..410ed188faa1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -46,7 +46,6 @@
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
-#include <linux/smb_fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 #endif /* CONFIG_BLOCK */
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
-                        compat_uid_t __user *argp)
-{
-        mm_segment_t old_fs = get_fs();
-        __kernel_uid_t kuid;
-        int err;
-        cmd = SMB_IOC_GETMOUNTUID;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-        set_fs(old_fs);
-        if (err >= 0)
-                err = put_user(kuid, argp);
-        return err;
-}
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO         _IOW('U', 200, int)
 #define HCIUARTGETPROTO         _IOR('U', 201, int)
@@ -1199,8 +1179,9 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
 COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* SMB ioctls which do not need any translations */
+/* Raw devices */
-COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
+COMPATIBLE_IOCTL(RAW_SETBIND)
+COMPATIBLE_IOCTL(RAW_GETBIND)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1458,10 +1439,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
        case MTIOCPOS32:
                return mt_ioctl_trans(fd, cmd, argp);
 #endif
-        /* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-        case SMB_IOC_GETMOUNTUID_32:
-                return do_smb_getmountuid(fd, cmd, argp);
        /* Serial */
        case TIOCGSERIAL:
        case TIOCSSERIAL:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
        struct inode * inode = new_inode(configfs_sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mapping->a_ops = &configfs_aops;
                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..7d3607febe1c 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static int configfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
+        return mount_single(fs_type, flags, data, configfs_fill_super);
 }
 static struct file_system_type configfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "configfs",
-        .get_sb         = configfs_get_sb,
+        .mount          = configfs_do_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..32fd5fe9ca0e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = {
        .statfs         = cramfs_statfs,
 };
-static int cramfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
-                           mnt);
 }
 static struct file_system_type cramfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "cramfs",
-        .get_sb         = cramfs_get_sb,
+        .mount          = cramfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
-static void __d_free(struct dentry *dentry)
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+                   size_t *lenp, loff_t *ppos)
+{
+        dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+        dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+static void __d_free(struct rcu_head *head)
 {
+        struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
        WARN_ON(!list_empty(&dentry->d_alias));
        if (dname_external(dentry))
                kfree(dentry->d_name.name);
        kmem_cache_free(dentry_cache, dentry); 
 }
-static void d_callback(struct rcu_head *head)
-{
-        struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-        __d_free(dentry);
-}
 /*
- * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
+ * no dcache_lock, please.
- * inside dcache_lock.
 */
 static void d_free(struct dentry *dentry)
 {
+        percpu_counter_dec(&nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
        /* if dentry was never inserted into hash, immediate free is OK */
        if (hlist_unhashed(&dentry->d_hash))
-                __d_free(dentry);
+                __d_free(&dentry->d_u.d_rcu);
        else
-                call_rcu(&dentry->d_u.d_rcu, d_callback);
+                call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 /*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
 }
 /*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
 */
 static void dentry_lru_add(struct dentry *dentry)
 {
-        list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+        if (list_empty(&dentry->d_lru)) {
-        dentry->d_sb->s_nr_dentry_unused++;
+                list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry_stat.nr_unused++;
+                dentry->d_sb->s_nr_dentry_unused++;
-}
+                percpu_counter_inc(&nr_dentry_unused);
+        }
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
-        list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry->d_sb->s_nr_dentry_unused++;
-        dentry_stat.nr_unused++;
 }
 static void dentry_lru_del(struct dentry *dentry)
 {
        if (!list_empty(&dentry->d_lru)) {
-                list_del(&dentry->d_lru);
+                list_del_init(&dentry->d_lru);
                dentry->d_sb->s_nr_dentry_unused--;
-                dentry_stat.nr_unused--;
+                percpu_counter_dec(&nr_dentry_unused);
        }
 }
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
 {
-        if (likely(!list_empty(&dentry->d_lru))) {
+        if (list_empty(&dentry->d_lru)) {
-                list_del_init(&dentry->d_lru);
+                list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-                dentry->d_sb->s_nr_dentry_unused--;
+                dentry->d_sb->s_nr_dentry_unused++;
-                dentry_stat.nr_unused--;
+                percpu_counter_inc(&nr_dentry_unused);
+        } else {
+                list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
        }
 }
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
        struct dentry *parent;
        list_del(&dentry->d_u.d_child);
-        dentry_stat.nr_dentry--;        /* For d_free, below */
        /*drops the locks, at that point nobody can reach this dentry */
        dentry_iput(dentry);
        if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
                if (dentry->d_op->d_delete(dentry))
                        goto unhash_it;
        }
        /* Unreachable? Get rid of it */
        if (d_unhashed(dentry))
                goto kill_it;
-        if (list_empty(&dentry->d_lru)) {
-                dentry->d_flags |= DCACHE_REFERENCED;
+        /* Otherwise leave it cached and ensure it's on the LRU */
-                dentry_lru_add(dentry);
+        dentry->d_flags |= DCACHE_REFERENCED;
-        }
+        dentry_lru_add(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 /* This should be called _only_ with dcache_lock held */
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
        atomic_inc(&dentry->d_count);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
        return dentry;
 }
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
                if (dentry->d_op && dentry->d_op->d_delete)
                        dentry->d_op->d_delete(dentry);
-                dentry_lru_del_init(dentry);
+                dentry_lru_del(dentry);
                __d_drop(dentry);
                dentry = d_kill(dentry);
                spin_lock(&dcache_lock);
        }
 }
-/*
+static void shrink_dentry_list(struct list_head *list)
- * Shrink the dentry LRU on a given superblock.
- * @sb   : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 {
-        LIST_HEAD(referenced);
-        LIST_HEAD(tmp);
        struct dentry *dentry;
-        int cnt = 0;
-        BUG_ON(!sb);
+        while (!list_empty(list)) {
-        BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+                dentry = list_entry(list->prev, struct dentry, d_lru);
-        spin_lock(&dcache_lock);
+                dentry_lru_del(dentry);
-        if (count != NULL)
-                /* called from prune_dcache() and shrink_dcache_parent() */
-                cnt = *count;
-restart:
-        if (count == NULL)
-                list_splice_init(&sb->s_dentry_lru, &tmp);
-        else {
-                while (!list_empty(&sb->s_dentry_lru)) {
-                        dentry = list_entry(sb->s_dentry_lru.prev,
-                                        struct dentry, d_lru);
-                        BUG_ON(dentry->d_sb != sb);
-                        spin_lock(&dentry->d_lock);
-                        /*
-                         * If we are honouring the DCACHE_REFERENCED flag and
-                         * the dentry has this flag set, don't free it. Clear
-                         * the flag and put it back on the LRU.
-                         */
-                        if ((flags & DCACHE_REFERENCED)
-                                && (dentry->d_flags & DCACHE_REFERENCED)) {
-                                dentry->d_flags &= ~DCACHE_REFERENCED;
-                                list_move(&dentry->d_lru, &referenced);
-                                spin_unlock(&dentry->d_lock);
-                        } else {
-                                list_move_tail(&dentry->d_lru, &tmp);
-                                spin_unlock(&dentry->d_lock);
-                                cnt--;
-                                if (!cnt)
-                                        break;
-                        }
-                        cond_resched_lock(&dcache_lock);
-                }
-        }
-        while (!list_empty(&tmp)) {
-                dentry = list_entry(tmp.prev, struct dentry, d_lru);
-                dentry_lru_del_init(dentry);
-                spin_lock(&dentry->d_lock);
                /*
                 * We found an inuse dentry which was not removed from
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
+                spin_lock(&dentry->d_lock);
                if (atomic_read(&dentry->d_count)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
@@ -516,13 +477,60 @@ restart:
                /* dentry->d_lock was dropped in prune_one_dentry() */
                cond_resched_lock(&dcache_lock);
        }
-        if (count == NULL && !list_empty(&sb->s_dentry_lru))
+}
-                goto restart;
-        if (count != NULL)
+/**
-                *count = cnt;
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb:         superblock to shrink dentry LRU.
+ * @count:      number of entries to prune
+ * @flags:      flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED, referenced dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+        /* called from prune_dcache() and shrink_dcache_parent() */
+        struct dentry *dentry;
+        LIST_HEAD(referenced);
+        LIST_HEAD(tmp);
+        int cnt = *count;
+        spin_lock(&dcache_lock);
+        while (!list_empty(&sb->s_dentry_lru)) {
+                dentry = list_entry(sb->s_dentry_lru.prev,
+                                struct dentry, d_lru);
+                BUG_ON(dentry->d_sb != sb);
+                /*
+                 * If we are honouring the DCACHE_REFERENCED flag and the
+                 * dentry has this flag set, don't free it.  Clear the flag
+                 * and put it back on the LRU.
+                 */
+                if (flags & DCACHE_REFERENCED) {
+                        spin_lock(&dentry->d_lock);
+                        if (dentry->d_flags & DCACHE_REFERENCED) {
+                                dentry->d_flags &= ~DCACHE_REFERENCED;
+                                list_move(&dentry->d_lru, &referenced);
+                                spin_unlock(&dentry->d_lock);
+                                cond_resched_lock(&dcache_lock);
+                                continue;
+                        }
+                        spin_unlock(&dentry->d_lock);
+                }
+                list_move_tail(&dentry->d_lru, &tmp);
+                if (!--cnt)
+                        break;
+                cond_resched_lock(&dcache_lock);
+        }
+        *count = cnt;
+        shrink_dentry_list(&tmp);
        if (!list_empty(&referenced))
                list_splice(&referenced, &sb->s_dentry_lru);
        spin_unlock(&dcache_lock);
 }
 /**
@@ -538,7 +546,7 @@ static void prune_dcache(int count)
 {
        struct super_block *sb, *p = NULL;
        int w_count;
-        int unused = dentry_stat.nr_unused;
+        int unused = percpu_counter_sum_positive(&nr_dentry_unused);
        int prune_ratio;
        int pruned;
@@ -608,13 +616,19 @@ static void prune_dcache(int count)
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
- * Shrink the dcache for the specified super block. This
+ * Shrink the dcache for the specified super block. This is used to free
- * is used to free the dcache before unmounting a file
+ * the dcache before unmounting a file system.
- * system
 */
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
 {
-        __shrink_dcache_sb(sb, NULL, 0);
+        LIST_HEAD(tmp);
+        spin_lock(&dcache_lock);
+        while (!list_empty(&sb->s_dentry_lru)) {
+                list_splice_init(&sb->s_dentry_lru, &tmp);
+                shrink_dentry_list(&tmp);
+        }
+        spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
        /* detach this root from the system */
        spin_lock(&dcache_lock);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
        __d_drop(dentry);
        spin_unlock(&dcache_lock);
@@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                        spin_lock(&dcache_lock);
                        list_for_each_entry(loop, &dentry->d_subdirs,
                                            d_u.d_child) {
-                                dentry_lru_del_init(loop);
+                                dentry_lru_del(loop);
                                __d_drop(loop);
                                cond_resched_lock(&dcache_lock);
                        }
@@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                         * otherwise we ascend to the parent and move to the
                         * next sibling if there is one */
                        if (!parent)
-                                goto out;
+                                return;
                        dentry = parent;
                } while (list_empty(&dentry->d_subdirs));
                dentry = list_entry(dentry->d_subdirs.next,
                                    struct dentry, d_u.d_child);
        }
-out:
-        /* several dentries were freed, need to correct nr_dentry */
-        spin_lock(&dcache_lock);
-        dentry_stat.nr_dentry -= detached;
-        spin_unlock(&dcache_lock);
 }
 /*
@@ -830,14 +837,15 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-                dentry_lru_del_init(dentry);
                /* 
                 * move only zero ref count dentries to the end 
                 * of the unused list for prune_dcache
                 */
                if (!atomic_read(&dentry->d_count)) {
-                        dentry_lru_add_tail(dentry);
+                        dentry_lru_move_tail(dentry);
                        found++;
+                } else {
+                        dentry_lru_del(dentry);
                }
                /*
@@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+        int nr_unused;
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                prune_dcache(nr);
        }
-        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        spin_lock(&dcache_lock);
        if (parent)
                list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-        dentry_stat.nr_dentry++;
        spin_unlock(&dcache_lock);
+        percpu_counter_inc(&nr_dentry);
        return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@ out:
 * This is used by ncpfs in its readdir implementation.
 * Zero is returned in the dentry is invalid.
 */
- 
+int d_validate(struct dentry *dentry, struct dentry *parent)
-int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-        struct hlist_head *base;
+        struct hlist_head *head;
-        struct hlist_node *lhp;
+        struct hlist_node *node;
+        struct dentry *d;
         /* Check whether the ptr might be valid at all.. */
         if (!kmem_ptr_validate(dentry_cache, dentry))
-                goto out;
+                return 0;
+        if (dentry->d_parent != parent)
-        if (dentry->d_parent != dparent)
+                return 0;
-                goto out;
-        spin_lock(&dcache_lock);
+        /*
+         * Look up the hash chain only now: dentry->d_name must not be
+         * read before the pointer has passed kmem_ptr_validate().
+         */
+        head = d_hash(parent, dentry->d_name.hash);
+        rcu_read_lock();
-        base = d_hash(dparent, dentry->d_name.hash);
+        hlist_for_each_entry_rcu(d, node, head, d_hash) {
-        hlist_for_each(lhp,base) { 
+                if (d == dentry) {
-                /* hlist_for_each_entry_rcu() not required for d_hash list
+                        dget(dentry);
-                 * as it is parsed under dcache_lock
-                 */
-                if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
-                        __dget_locked(dentry);
-                        spin_unlock(&dcache_lock);
+                        /* Every exit must leave the RCU read-side section. */
+                        rcu_read_unlock();
                         return 1;
                 }
         }
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
-out:
         return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1994,7 +2000,7 @@ global_root:
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root,
        int error;
        prepend(&res, &buflen, "\0", 1);
+        spin_lock(&dcache_lock);
        error = prepend_path(path, root, &res, &buflen);
+        spin_unlock(&dcache_lock);
        if (error)
                return ERR_PTR(error);
        return res;
 }
@@ -2419,6 +2427,9 @@ static void __init dcache_init(void)
 {
        int loop;
+        percpu_counter_init(&nr_dentry, 0);
+        percpu_counter_init(&nr_dentry_unused, 0);
        /* 
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
        struct inode *inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
        return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
-static int debug_get_sb(struct file_system_type *fs_type,
+static struct dentry *debug_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
+        return mount_single(fs_type, flags, data, debug_fill_super);
 }
 static struct file_system_type debug_fs_type = {
        .owner =        THIS_MODULE,
        .name =         "debugfs",
-        .get_sb =       debug_get_sb,
+        .mount =        debug_mount,
        .kill_sb =      kill_litter_super,
 };
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 /*
- * devpts_get_sb()
+ * devpts_mount()
 *
 *     If the '-o newinstance' mount option was specified, mount a new
 *     (private) instance of devpts.  PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 *     semantics in devpts while preserving backward compatibility of the
 *     current 'single-namespace' semantics. i.e all mounts of devpts
 *     without the 'newinstance' mount option should bind to the initial
- *     kernel mount, like get_sb_single().
+ *     kernel mount, like mount_single().
 *
 *     Mounts with 'newinstance' option create a new, private namespace.
 *
 *     NOTE:
 *
- *     For single-mount semantics, devpts cannot use get_sb_single(),
+ *     For single-mount semantics, devpts cannot use mount_single(),
- *     because get_sb_single()/sget() find and use the super-block from
+ *     because mount_single()/sget() find and use the super-block from
 *     the most recent mount of devpts. But that recent mount may be a
- *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     'newinstance' mount and mount_single() would pick the newinstance
 *     super-block instead of the initial super-block.
 */
-static int devpts_get_sb(struct file_system_type *fs_type,
+static struct dentry *devpts_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        int error;
        struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
        error = parse_mount_options(data, PARSE_MOUNT, &opts);
        if (error)
-                return error;
+                return ERR_PTR(error);
        if (opts.newinstance)
                s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
                s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        if (!s->s_root) {
                s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
        if (error)
                goto out_undo_sget;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 out_undo_sget:
        deactivate_locked_super(s);
-        return error;
+        return ERR_PTR(error);
 }
 #else
@@ -404,10 +402,10 @@ out_undo_sget:
 * This supports only the legacy single-instance semantics (no
 * multiple-instance semantics)
 */
-static int devpts_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data, struct vfsmount *mnt)
+                const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+        return mount_single(fs_type, flags, data, devpts_fill_super);
 }
 #endif
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
 static struct file_system_type devpts_fs_type = {
        .name           = "devpts",
-        .get_sb         = devpts_get_sb,
+        .mount          = devpts_mount,
        .kill_sb        = devpts_kill_sb,
 };
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..85882f6ba5f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
-static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
 {
        ssize_t transferred = 0;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 40186b959429..413a3c48f0bb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
+#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
        u32 flags;
        struct list_head global_auth_tok_list;
        struct mutex global_auth_tok_list_mutex;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..9d1a22d62765 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
        struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
        struct dentry *dentry_save;
        struct vfsmount *vfsmount_save;
+        unsigned int flags_save;
        int rc;
        dentry_save = nd->path.dentry;
        vfsmount_save = nd->path.mnt;
+        flags_save = nd->flags;
        nd->path.dentry = lower_dentry;
        nd->path.mnt = lower_mnt;
+        nd->flags &= ~LOOKUP_OPEN;
        rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
        nd->path.dentry = dentry_save;
        nd->path.mnt = vfsmount_save;
+        nd->flags = flags_save;
        return rc;
 }
@@ -1108,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                rc = -EOPNOTSUPP;
                goto out;
        }
-        mutex_lock(&lower_dentry->d_inode->i_mutex);
-        rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
+        rc = vfs_setxattr(lower_dentry, name, value, size, flags);
-                                                   size, flags);
-        mutex_unlock(&lower_dentry->d_inode->i_mutex);
 out:
        return rc;
 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..b1f6858a5223 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
 */
 static int
 ecryptfs_find_auth_tok_for_sig(
+        struct key **auth_tok_key,
        struct ecryptfs_auth_tok **auth_tok,
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
        char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
        struct ecryptfs_global_auth_tok *global_auth_tok;
        int rc = 0;
+        (*auth_tok_key) = NULL;
        (*auth_tok) = NULL;
        if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
                                                  mount_crypt_stat, sig)) {
-                struct key *auth_tok_key;
-                rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+                /* If the ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY flag is set in
+                 * the mount_crypt_stat structure, refuse to use auth toks
+                 * that were not added through the
+                 * ecryptfs_add_global_auth_tok function.
+                 */
+                if (mount_crypt_stat->flags
+                                & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+                        return -EINVAL;
+                rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
                                                       sig);
        } else
                (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
                             char *filename, size_t filename_size)
 {
        struct ecryptfs_write_tag_70_packet_silly_stack *s;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
        }
        dest[s->i++] = s->cipher_code;
        rc = ecryptfs_find_auth_tok_for_sig(
+                &auth_tok_key,
                &s->auth_tok, mount_crypt_stat,
                mount_crypt_stat->global_default_fnek_sig);
        if (rc) {
@@ -753,6 +765,8 @@ out_free_unlock:
 out_unlock:
        mutex_unlock(s->tfm_mutex);
 out:
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        kfree(s);
        return rc;
 }
@@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
                             char *data, size_t max_packet_size)
 {
        struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        (*packet_size) = 0;
@@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
         * >= ECRYPTFS_MAX_IV_BYTES. */
        memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
        s->desc.info = s->iv;
-        rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+        rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+                                            &s->auth_tok, mount_crypt_stat,
                                            s->fnek_sig_hex);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +1002,8 @@ out:
                (*filename_size) = 0;
                (*filename) = NULL;
        }
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        kfree(s);
        return rc;
 }
@@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
                       ECRYPTFS_VERSION_MAJOR,
                       ECRYPTFS_VERSION_MINOR);
                rc = -EINVAL;
-                goto out;
+                goto out_release_key;
        }
        if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
            && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
                printk(KERN_ERR "Invalid auth_tok structure "
                       "returned from key query\n");
                rc = -EINVAL;
-                goto out;
+                goto out_release_key;
+        }
+out_release_key:
+        if (rc) {
+                key_put(*auth_tok_key);
+                (*auth_tok_key) = NULL;
        }
 out:
        return rc;
@@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
        struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
        size_t tag_11_contents_size;
        size_t tag_11_packet_size;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        INIT_LIST_HEAD(&auth_tok_list);
@@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
         * just one will be sufficient to decrypt to get the FEK. */
 find_next_matching_auth_tok:
        found_auth_tok = 0;
+        if (auth_tok_key) {
+                key_put(auth_tok_key);
+                auth_tok_key = NULL;
+        }
        list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
                candidate_auth_tok = &auth_tok_list_item->auth_tok;
                if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,10 +1828,11 @@ find_next_matching_auth_tok:
                        rc = -EINVAL;
                        goto out_wipe_list;
                }
-                ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+                rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+                                               &matching_auth_tok,
                                               crypt_stat->mount_crypt_stat,
                                               candidate_auth_tok_sig);
-                if (matching_auth_tok) {
+                if (!rc) {
                        found_auth_tok = 1;
                        goto found_matching_auth_tok;
                }
@@ -1866,6 +1895,8 @@ found_matching_auth_tok:
 out_wipe_list:
        wipe_auth_tok_list(&auth_tok_list);
 out:
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        return rc;
 }
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..a9dbd62518e6 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
+       ecryptfs_opt_err };
 static const match_table_t tokens = {
        {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
        {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
        {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
        {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
+        {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
        {ecryptfs_opt_err, NULL}
 };
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
                case ecryptfs_opt_unlink_sigs:
                        mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
                        break;
+                case ecryptfs_opt_mount_auth_tok_only:
+                        mount_crypt_stat->flags |=
+                                ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
+                        break;
                case ecryptfs_opt_err:
                default:
                        printk(KERN_WARNING
@@ -540,9 +546,8 @@ out:
 *                        ecryptfs_interpose to perform most of the linking
 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
-static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
-                        const char *dev_name, void *raw_data,
+                        const char *dev_name, void *raw_data)
-                        struct vfsmount *mnt)
 {
        struct super_block *s;
        struct ecryptfs_sb_info *sbi;
@@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
                err = "Reading sb failed";
                goto out;
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 out:
        if (sbi) {
@@ -616,7 +620,7 @@ out:
                kmem_cache_free(ecryptfs_sb_info_cache, sbi);
        }
        printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
-        return rc;
+        return ERR_PTR(rc);
 }
 /**
@@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
 static struct file_system_type ecryptfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "ecryptfs",
-        .get_sb = ecryptfs_get_sb,
+        .mount = ecryptfs_mount,
        .kill_sb = ecryptfs_kill_block_super,
        .fs_flags = 0
 };
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..253732382d37 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -180,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_printf(m, ",ecryptfs_encrypted_view");
        if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
                seq_printf(m, ",ecryptfs_unlink_sigs");
+        if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+                seq_printf(m, ",ecryptfs_mount_auth_tok_only");
        return 0;
 }
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..5073a07652cc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
-static int efs_get_sb(struct file_system_type *fs_type,
+static struct dentry *efs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
 }
 static struct file_system_type efs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "efs",
-        .get_sb         = efs_get_sb,
+        .mount          = efs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 256bb7bb102a..8cf07242067d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
-/* Maximum msec timeout value storeable in a long int */
-#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 #define EP_UNACTIVE_PTR ((void *) -1L)
@@ -1117,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
 {
-        int res, eavail;
+        int res, eavail, timed_out = 0;
        unsigned long flags;
-        long jtimeout;
+        long slack;
        wait_queue_t wait;
+        struct timespec end_time;
-        /*
+        ktime_t expires, *to = NULL;
-         * Calculate the timeout by checking for the "infinite" value (-1)
-         * and the overflow condition. The passed timeout is in milliseconds,
+        if (timeout > 0) {
-         * that why (t * HZ) / 1000.
+                ktime_get_ts(&end_time);
-         */
+                timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
-        jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
+                slack = select_estimate_accuracy(&end_time);
-                MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
+                to = &expires;
+                *to = timespec_to_ktime(end_time);
+        } else if (timeout == 0) {
+                timed_out = 1;
+        }
 retry:
        spin_lock_irqsave(&ep->lock, flags);
@@ -1150,7 +1151,7 @@ retry:
                         * to TASK_INTERRUPTIBLE before doing the checks.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
-                        if (!list_empty(&ep->rdllist) || !jtimeout)
+                        if (!list_empty(&ep->rdllist) || timed_out)
                                break;
                        if (signal_pending(current)) {
                                res = -EINTR;
@@ -1158,7 +1159,9 @@ retry:
                        }
                        spin_unlock_irqrestore(&ep->lock, flags);
-                        jtimeout = schedule_timeout(jtimeout);
+                        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+                                timed_out = 1;
                        spin_lock_irqsave(&ep->lock, flags);
                }
                __remove_wait_queue(&ep->wq, &wait);
@@ -1176,7 +1179,7 @@ retry:
         * more luck.
         */
        if (!res && eavail &&
-            !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
+            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                goto retry;
        return res;
diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..99d33a1371e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 int suid_dumpable = 0;
+struct core_name {
+        char *corename; /* buffer holding the core name built so far */
+        int used, size; /* bytes written (without the NUL) / bytes allocated */
+};
+static atomic_t call_count = ATOMIC_INIT(1); /* corename size multiplier */
 /* The maximal length of core_pattern is also specified in sysctl.c */
 static LIST_HEAD(formats);
@@ -759,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
+        if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+                atomic_dec(&old_mm->oom_disable_count);
+                atomic_inc(&tsk->mm->oom_disable_count);
+        }
        task_unlock(tsk);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
@@ -998,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
        bprm->mm = NULL;                /* We're using it now */
-        current->flags &= ~PF_RANDOMIZE;
+        current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
        flush_thread();
        current->personality &= ~bprm->per_clear;
@@ -1078,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
 */
 int prepare_bprm_creds(struct linux_binprm *bprm)
 {
-        if (mutex_lock_interruptible(&current->cred_guard_mutex))
+        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                return -ERESTARTNOINTR;
        bprm->cred = prepare_exec_creds();
        if (likely(bprm->cred))
                return 0;
-        mutex_unlock(&current->cred_guard_mutex);
+        mutex_unlock(&current->signal->cred_guard_mutex);
        return -ENOMEM;
 }
@@ -1093,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
 {
        free_arg_pages(bprm);
        if (bprm->cred) {
-                mutex_unlock(&current->cred_guard_mutex);
+                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        kfree(bprm);
@@ -1114,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);
-        mutex_unlock(&current->cred_guard_mutex);
+        mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 /*
 * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold current->signal->cred_guard_mutex to protect against
 *   PTRACE_ATTACH
 */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1412,6 @@ int do_execve(const char * filename,
        if (retval < 0)
                goto out;
-        current->flags &= ~PF_KTHREAD;
        retval = search_binary_handler(bprm,regs);
        if (retval < 0)
                goto out;
@@ -1454,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
 EXPORT_SYMBOL(set_binfmt);
+/*
+ * Grow cn->corename. The new size is CORENAME_MAX_SIZE times the freshly
+ * incremented global call_count. Returns 0 on success; on allocation
+ * failure the old buffer is freed, cn->corename is left NULL and -ENOMEM
+ * is returned.
+ */
+static int expand_corename(struct core_name *cn)
+{
+        char *prev = cn->corename;
+        char *grown;
+        cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+        grown = krealloc(prev, cn->size, GFP_KERNEL);
+        cn->corename = grown;
+        if (grown)
+                return 0;
+        /* the old buffer is still ours to release when krealloc() fails */
+        kfree(prev);
+        return -ENOMEM;
+}
+/*
+ * cn_printf - append formatted text to the core name buffer
+ * @cn:  core name being built; cn->corename must already be allocated
+ * @fmt: printf-style format, followed by its arguments
+ *
+ * The formatted length is measured first, the buffer is grown with
+ * expand_corename() until the text and its NUL terminator fit, and only
+ * then is the text written at offset cn->used.
+ *
+ * Returns 0 on success or the error from expand_corename(), in which case
+ * cn->corename has already been freed and set to NULL.
+ */
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+        va_list arg;
+        int need;
+        int ret;
+        /* Dry run: how many bytes does this item need (excluding the NUL)? */
+        va_start(arg, fmt);
+        need = vsnprintf(NULL, 0, fmt, arg);
+        va_end(arg);
+        /*
+         * One expansion is not guaranteed to be enough for a long item, so
+         * re-check after every expand_corename() instead of assuming the
+         * text now fits. Each call bumps call_count, so cn->size keeps
+         * growing and the loop terminates.
+         */
+        while (unlikely(need >= cn->size - cn->used - 1)) {
+                ret = expand_corename(cn);
+                if (ret)
+                        return ret;
+        }
+        va_start(arg, fmt);
+        vsnprintf(cn->corename + cn->used, need + 1, fmt, arg);
+        va_end(arg);
+        cn->used += need;
+        return 0;
+}
 /* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
-static int format_corename(char *corename, long signr)
+static int format_corename(struct core_name *cn, long signr)
 {
        const struct cred *cred = current_cred();
        const char *pat_ptr = core_pattern;
        int ispipe = (*pat_ptr == '|');
-        char *out_ptr = corename;
-        char *const out_end = corename + CORENAME_MAX_SIZE;
-        int rc;
        int pid_in_pattern = 0;
+        int err = 0;
+        cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+        cn->corename = kmalloc(cn->size, GFP_KERNEL);
+        cn->used = 0;
+        if (!cn->corename)
+                return -ENOMEM;
        /* Repeat as long as we have more pattern to process and more output
           space */
        while (*pat_ptr) {
                if (*pat_ptr != '%') {
-                        if (out_ptr == out_end)
+                        if (*pat_ptr == 0)
                                goto out;
-                        *out_ptr++ = *pat_ptr++;
+                        err = cn_printf(cn, "%c", *pat_ptr++);
                } else {
                        switch (*++pat_ptr) {
+                        /* single % at the end, drop that */
                        case 0:
                                goto out;
                        /* Double percent, output one percent */
                        case '%':
-                                if (out_ptr == out_end)
+                                err = cn_printf(cn, "%c", '%');
-                                        goto out;
-                                *out_ptr++ = '%';
                                break;
                        /* pid */
                        case 'p':
                                pid_in_pattern = 1;
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d",
-                                              "%d", task_tgid_vnr(current));
+                                              task_tgid_vnr(current));
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* uid */
                        case 'u':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d", cred->uid);
-                                              "%d", cred->uid);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* gid */
                        case 'g':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d", cred->gid);
-                                              "%d", cred->gid);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* signal that caused the coredump */
                        case 's':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%ld", signr);
-                                              "%ld", signr);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* UNIX time of coredump */
                        case 't': {
                                struct timeval tv;
                                do_gettimeofday(&tv);
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%lu", tv.tv_sec);
-                                              "%lu", tv.tv_sec);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        }
                        /* hostname */
                        case 'h':
                                down_read(&uts_sem);
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%s",
-                                              "%s", utsname()->nodename);
+                                              utsname()->nodename);
                                up_read(&uts_sem);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* executable */
                        case 'e':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%s", current->comm);
-                                              "%s", current->comm);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* core limit size */
                        case 'c':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%lu",
-                                              "%lu", rlimit(RLIMIT_CORE));
+                                              rlimit(RLIMIT_CORE));
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        default:
                                break;
                        }
                        ++pat_ptr;
                }
+                if (err)
+                        return err;
        }
        /* Backward compatibility with core_uses_pid:
         *
         * If core_pattern does not include a %p (as is the default)
         * and core_uses_pid is set, then .%pid will be appended to
         * the filename. Do not do this for piped commands. */
        if (!ispipe && !pid_in_pattern && core_uses_pid) {
-                rc = snprintf(out_ptr, out_end - out_ptr,
+                err = cn_printf(cn, ".%d", task_tgid_vnr(current));
-                              ".%d", task_tgid_vnr(current));
+                if (err)
-                if (rc > out_end - out_ptr)
+                        return err;
-                        goto out;
-                out_ptr += rc;
        }
 out:
-        *out_ptr = 0;
        return ispipe;
 }
@@ -1851,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
        struct core_state core_state;
-        char corename[CORENAME_MAX_SIZE + 1];
+        struct core_name cn;
        struct mm_struct *mm = current->mm;
        struct linux_binfmt * binfmt;
        const struct cred *old_cred;
@@ -1906,7 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         */
        clear_thread_flag(TIF_SIGPENDING);
-        ispipe = format_corename(corename, signr);
+        ispipe = format_corename(&cn, signr);
+        if (ispipe == -ENOMEM) {
+                printk(KERN_WARNING "format_corename failed\n");
+                printk(KERN_WARNING "Aborting core\n");
+                goto fail_corename;
+        }
        if (ispipe) {
                int dump_count;
@@ -1943,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                        goto fail_dropcount;
                }
-                helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
+                helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
                if (!helper_argv) {
                        printk(KERN_WARNING "%s failed to allocate memory\n",
                               __func__);
@@ -1956,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                argv_free(helper_argv);
                if (retval) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
-                               corename);
+                               cn.corename);
                        goto close_fail;
                }
        } else {
@@ -1965,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                if (cprm.limit < binfmt->min_coredump)
                        goto fail_unlock;
-                cprm.file = filp_open(corename,
+                cprm.file = filp_open(cn.corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
                                 0600);
                if (IS_ERR(cprm.file))
@@ -2007,6 +2044,8 @@ fail_dropcount:
        if (ispipe)
                atomic_dec(&core_dump_count);
 fail_unlock:
+        kfree(cn.corename);
+fail_corename:
        coredump_finish(mm);
        revert_creds(old_cred);
 fail_creds:
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
        err = exofs_write_begin(NULL, page->mapping, pos, len,
                                AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
        if (err)
-                EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+                EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
                          err);
        de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
        err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
                                                        &page, NULL);
        if (err)
-                EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+                EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
                          err);
        if (pde)
                pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 {
        int ret;
        struct inode *inode = filp->f_mapping->host;
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct super_block *sb;
        if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return 0;
-        ret = sync_inode(inode, &wbc);
+        ret = sync_inode_metadata(inode, 1);
        /* This is a good place to write the sb */
        /* TODO: Sechedule an sb-sync on create */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 44602754f758..42685424817b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
 * status.
 */
-static int __readpages_done(struct page_collect *pcol, bool do_unlock)
+static int __readpages_done(struct page_collect *pcol)
 {
        int i;
        u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
                          page_stat ? "bad_bytes" : "good_bytes");
                ret = update_read_page(page, page_stat);
-                if (do_unlock)
+                if (!pcol->read_4_write)
                        unlock_page(page);
                length += PAGE_SIZE;
        }
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
-        __readpages_done(pcol, true);
+        __readpages_done(pcol);
        atomic_dec(&pcol->sbi->s_curr_pending);
        kfree(pcol);
 }
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
        }
 }
-static int read_exec(struct page_collect *pcol, bool is_sync)
+static int read_exec(struct page_collect *pcol)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
        struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        if (!pcol->pages)
                return 0;
-        /* see comment in _readpage() about sync reads */
-        WARN_ON(is_sync && (pcol->nr_pages != 1));
        ios->pages = pcol->pages;
        ios->nr_pages = pcol->nr_pages;
        ios->length = pcol->length;
        ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
-        if (is_sync) {
+        if (pcol->read_4_write) {
                exofs_oi_read(oi, pcol->ios);
-                return __readpages_done(pcol, false);
+                return __readpages_done(pcol);
        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        return 0;
 err:
-        if (!is_sync)
+        if (!pcol->read_4_write)
                _unlock_pcol_pages(pcol, ret, READ);
        pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
                EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
                             " splitting\n", inode->i_ino, page->index);
-                return read_exec(pcol, false);
+                return read_exec(pcol);
        }
 try_again:
@@ -366,7 +363,7 @@ try_again:
        } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
                   page->index)) {
                /* Discontinuity detected, split the request */
-                ret = read_exec(pcol, false);
+                ret = read_exec(pcol);
                if (unlikely(ret))
                        goto fail;
                goto try_again;
@@ -391,7 +388,7 @@ try_again:
                          page, len, pcol->nr_pages, pcol->length);
                /* split the request, and start again with current page */
-                ret = read_exec(pcol, false);
+                ret = read_exec(pcol);
                if (unlikely(ret))
                        goto fail;
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
                return ret;
        }
-        return read_exec(&pcol, false);
+        return read_exec(&pcol);
 }
-static int _readpage(struct page *page, bool is_sync)
+static int _readpage(struct page *page, bool read_4_write)
 {
        struct page_collect pcol;
        int ret;
        _pcol_init(&pcol, 1, page->mapping->host);
-        /* readpage_strip might call read_exec(,is_sync==false) at several
+        pcol.read_4_write = read_4_write;
-         * places but not if we have a single page.
-         */
-        pcol.read_4_write = is_sync;
        ret = readpage_strip(&pcol, page);
        if (ret) {
                EXOFS_ERR("_readpage => %d\n", ret);
                return ret;
        }
-        return read_exec(&pcol, is_sync);
+        return read_exec(&pcol);
 }
 /*
@@ -1036,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
                memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
        }
+        inode->i_mapping->backing_dev_info = sb->s_bdi;
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &exofs_file_inode_operations;
                inode->i_fop = &exofs_file_operations;
@@ -1072,8 +1067,10 @@ bad_inode:
 int __exofs_wait_obj_created(struct exofs_i_info *oi)
 {
        if (!obj_created(oi)) {
+                EXOFS_DBGMSG("!obj_created\n");
                BUG_ON(!obj_2bcreated(oi));
                wait_event(oi->i_wq, obj_created(oi));
+                EXOFS_DBGMSG("wait_event done\n");
        }
        return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
@@ -1107,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
        set_obj_created(oi);
-        atomic_dec(&inode->i_count);
        wake_up(&oi->i_wq);
 }
@@ -1135,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        sbi = sb->s_fs_info;
+        inode->i_mapping->backing_dev_info = sb->s_bdi;
        sb->s_dirt = 1;
        inode_init_owner(inode, dir, mode);
        inode->i_ino = sbi->s_nextid++;
@@ -1157,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        ios->obj.id = exofs_oi_objno(oi);
        exofs_make_credential(oi->i_cred, &ios->obj);
-        /* increment the refcount so that the inode will still be around when we
-         * reach the callback
-         */
-        atomic_inc(&inode->i_count);
        ios->done = create_done;
        ios->private = inode;
        ios->cred = oi->i_cred;
        ret = exofs_sbi_create(ios);
        if (ret) {
-                atomic_dec(&inode->i_count);
                exofs_put_io_state(ios);
                return ERR_PTR(ret);
        }
@@ -1257,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        ios->out_attr_len = 1;
        ios->out_attr = &attr;
-        if (!obj_created(oi)) {
+        wait_obj_created(oi);
-                EXOFS_DBGMSG("!obj_created\n");
-                BUG_ON(!obj_2bcreated(oi));
-                wait_event(oi->i_wq, obj_created(oi));
-                EXOFS_DBGMSG("wait_event done\n");
-        }
        if (!do_sync) {
                args->sbi = sbi;
@@ -1325,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode)
        inode->i_size = 0;
        end_writeback(inode);
-        /* if we are deleting an obj that hasn't been created yet, wait */
+        /* if we are deleting an obj that hasn't been created yet, wait.
-        if (!obj_created(oi)) {
+         * This also makes sure that create_done cannot be called with an
-                BUG_ON(!obj_2bcreated(oi));
+         * already evicted inode.
-                wait_event(oi->i_wq, obj_created(oi));
+         */
-                /* ignore the error attempt a remove anyway */
+        wait_obj_created(oi);
-        }
+        /* ignore the error, attempt a remove anyway */
        /* Now Remove the OSD objects */
        ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return exofs_add_nondir(dentry, inode);
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..79c3ae6e0456 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -659,19 +659,19 @@ free_bdi:
 /*
 * Set up the superblock (calls exofs_fill_super eventually)
 */
-static int exofs_get_sb(struct file_system_type *type,
+static struct dentry *exofs_mount(struct file_system_type *type,
                          int flags, const char *dev_name,
-                          void *data, struct vfsmount *mnt)
+                          void *data)
 {
        struct exofs_mountopt opts;
        int ret;
        ret = parse_options(data, &opts);
        if (ret)
-                return ret;
+                return ERR_PTR(ret);
        opts.dev_name = dev_name;
-        return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+        return mount_nodev(type, flags, &opts, exofs_fill_super);
 }
 /*
@@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
 static struct file_system_type exofs_type = {
        .owner          = THIS_MODULE,
        .name           = "exofs",
-        .get_sb         = exofs_get_sb,
+        .mount          = exofs_mount,
        .kill_sb        = generic_shutdown_super,
 };
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
 find_disconnected_root(struct dentry *dentry)
 {
        dget(dentry);
-        spin_lock(&dentry->d_lock);
+        while (!IS_ROOT(dentry)) {
-        while (!IS_ROOT(dentry) &&
+                struct dentry *parent = dget_parent(dentry);
-               (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
-                struct dentry *parent = dentry->d_parent;
+                if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
-                dget(parent);
+                        dput(parent);
-                spin_unlock(&dentry->d_lock);
+                        break;
+                }
                dput(dentry);
                dentry = parent;
-                spin_lock(&dentry->d_lock);
        }
-        spin_unlock(&dentry->d_lock);
        return dentry;
 }
 /*
 * Make sure target_dir is fully connected to the dentry tree.
 *
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
        return here;
 }
-/*
+/**
 * ext2_try_to_allocate()
 * @sb:                 superblock
- * @handle:             handle to this transaction
 * @group:              given allocation block group
 * @bitmap_bh:          bufferhead holds the block bitmap
 * @grp_goal:           given target block within the group
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
        if (IS_DIRSYNC(dir)) {
                err = write_one_page(page, 1);
                if (!err)
-                        err = ext2_sync_inode(dir);
+                        err = sync_inode_metadata(dir, 1);
        } else {
                unlock_page(page);
        }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
-extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 533699c16040..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        if (inode_needs_sync(inode)) {
                sync_mapping_buffers(inode->i_mapping);
-                ext2_sync_inode (inode);
+                sync_inode_metadata(inode, 1);
        } else {
                mark_inode_dirty(inode);
        }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
        return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
-int ext2_sync_inode(struct inode *inode)
-{
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0,       /* sys_fsync did this */
-        };
-        return sync_inode(inode, &wbc);
-}
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
        struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext2_add_link(dentry, inode);
        if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 85df87d0f7b7..d89e0b6a2d78 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        }
        es = sbi->s_es;
-        if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
+        if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
-            (old_mount_opt & EXT2_MOUNT_XIP)) &&
-            invalidate_inodes(sb)) {
                ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
                         "xip flag with busy inodes while remounting");
                sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1358,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
        return 0;
 }
-static int ext2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ext2_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
 }
 #ifdef CONFIG_QUOTA
@@ -1475,7 +1473,7 @@ out:
 static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext2",
-        .get_sb         = ext2_get_sb,
+        .mount          = ext2_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
        EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
        inode->i_ctime = CURRENT_TIME_SEC;
        if (IS_SYNC(inode)) {
-                error = ext2_sync_inode (inode);
+                error = sync_inode_metadata(inode, 1);
                /* In case sync failed due to ENOSPC the inode was actually
                 * written (only some dirty data were not) so we just proceed
                 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..b3db22649426 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
        if (here < 0)
                here = 0;
-        p = ((char *)bh->b_data) + (here >> 3);
+        p = bh->b_data + (here >> 3);
        r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-        next = (r - ((char *)bh->b_data)) << 3;
+        next = (r - bh->b_data) << 3;
        if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
                return next;
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 /**
 * claim_block()
+ * @lock:               the spin lock for this block group
 * @block:              the free block (group relative) to allocate
- * @bh:                 the bufferhead containts the block group bitmap
+ * @bh:                 the buffer_head that contains the block group bitmap
 *
 * We think we can allocate this block in this bitmap.  Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
 *              but we will shift to the place where start_block is,
 *              then start from there, when looking for a reservable space.
 *
- *      @size: the target new reservation window size
+ *      @my_rsv: the reservation window
 *
- *      @group_first_block: the first block we consider to start
+ *      @sb: the super block
+ *
+ *      @start_block: the first block we consider to start
 *                      the real search from
 *
 *      @last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
 *
 *      failed: we failed to find a reservation window in this group
 *
- *      @rsv: the reservation
+ *      @my_rsv: the reservation window
 *
 *      @grp_goal: The goal (group-relative).  It is where the search for a
 *              free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 * @group:              given allocation block group
 * @bitmap_bh:          bufferhead holds the block bitmap
 * @grp_goal:           given target block within the group
- * @count:              target number of blocks to allocate
 * @my_rsv:             reservation window
+ * @count:              target number of blocks to allocate
 * @errp:               pointer to store the error code
 *
 * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
        ei->i_state_flags = 0;
        ext3_set_inode_state(inode, EXT3_STATE_NEW);
-        ei->i_extra_isize =
+        /* See comment in ext3_iget for explanation */
-                (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
+        if (ino >= EXT3_FIRST_INO(sb) + 1 &&
-                sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+            EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+                ei->i_extra_isize =
+                        sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+        } else {
+                ei->i_extra_isize = 0;
+        }
        ret = inode;
        dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..a9580617edd2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 }
 /**
- *      ext3_blks_to_allocate: Look up the block map and count the number
+ *      ext3_blks_to_allocate - Look up the block map and count the number
 *      of direct blocks need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 }
 /**
- *      ext3_alloc_blocks: multiple allocate blocks needed for a branch
+ *      ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ *      @handle: handle for this transaction
+ *      @inode: owner
+ *      @goal: preferred place for allocation
 *      @indirect_blks: the number of blocks need to allocate for indirect
 *                      blocks
- *
+ *      @blks:  number of blocks that need to be allocated for direct blocks
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks(if needed) and the first direct block,
- *      @blks:  on return it will store the total number of allocated
+ *      @err: here we store the error value
- *              direct blocks
+ *
+ *      return the number of direct blocks allocated
 */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
                        ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
 /**
 *      ext3_alloc_branch - allocate and set up a chain of blocks.
+ *      @handle: handle for this transaction
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
+ *      @goal: preferred place for allocation
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
@@ -700,10 +706,9 @@ failed:
 /**
 * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *      ext3_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
@@ -1696,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
-                                        ext3_get_block);
+                                          ext3_get_block);
                if (ret != 0) {
                        ext3_journal_stop(handle);
                        goto out_unlock;
@@ -2530,7 +2535,6 @@ void ext3_truncate(struct inode *inode)
                         */
                } else {
                        /* Shared branch grows from an indirect block */
-                        BUFFER_TRACE(partial->bh, "get_write_access");
                        ext3_free_branches(handle, inode, partial->bh,
                                        partial->p,
                                        partial->p+1, (chain+n-1) - partial);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
        inode->i_ctime = CURRENT_TIME_SEC;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext3_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..e746d30b1232 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -977,7 +977,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        o_blocks_count = le32_to_cpu(es->s_blocks_count);
        if (test_opt(sb, DEBUG))
-                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
+                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+                       " upto "E3FSBLK" blocks\n",
                       o_blocks_count, n_blocks_count);
        if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
                printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-                        " too large to resize to %lu blocks safely\n",
+                        " too large to resize to "E3FSBLK" blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
                        ext3_warning(sb, __func__,
@@ -1065,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-        ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
+        ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
-                   o_blocks_count + add);
+                   o_blocks_count, o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-        ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
+        ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
-                   o_blocks_count + add);
+                   o_blocks_count, o_blocks_count + add);
        if ((err = ext3_journal_stop(handle)))
                goto exit_put;
        if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 377768009106..2fedaf8b5012 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1301,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                ext3_msg(sb, KERN_WARNING,
                        "warning: mounting fs with errors, "
                        "running e2fsck is recommended");
-        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
-                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+                        le16_to_cpu(es->s_max_mnt_count))
                ext3_msg(sb, KERN_WARNING,
                        "warning: maximal mount count reached, "
                        "running e2fsck is recommended");
@@ -1320,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                   valid forever! :) */
        es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
 #endif
-        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+        if (!le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        es->s_mtime = cpu_to_le32(get_seconds());
@@ -1647,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
         * Note: s_es must be initialized as soon as possible because
         *       some ext3 macro-instructions depend on its value
         */
-        es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+        es = (struct ext3_super_block *) (bh->b_data + offset);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1758,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                               "error: can't read superblock on 2nd try");
                        goto failed_mount;
                }
-                es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+                es = (struct ext3_super_block *)(bh->b_data + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
                        ext3_msg(sb, KERN_ERR,
@@ -1857,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
                               le32_to_cpu(es->s_first_data_block) - 1)
                                       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
-        db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
+        db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
-                   EXT3_DESC_PER_BLOCK(sb);
        sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
                                    GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
                ext3_msg(sb, KERN_ERR,
                        "error: not enough memory");
+                ret = -ENOMEM;
                goto failed_mount;
        }
@@ -1951,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        }
        if (err) {
                ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+                ret = err;
                goto failed_mount3;
        }
@@ -2159,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
                goto out_bdev;
        }
-        es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+        es = (struct ext3_super_block *) (bh->b_data + offset);
        if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2352,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb,
        if (!sbh)
                return error;
+        if (buffer_write_io_error(sbh)) {
+                /*
+                 * Oh, dear.  A previous attempt to write the
+                 * superblock failed.  This could happen because the
+                 * USB device was yanked out.  Or it could happen to
+                 * be a transient write error and maybe the block will
+                 * be remapped.  Nothing we can do but to retry the
+                 * write and hope for the best.
+                 */
+                ext3_msg(sb, KERN_ERR, "previous I/O error to "
+                       "superblock detected");
+                clear_buffer_write_io_error(sbh);
+                set_buffer_uptodate(sbh);
+        }
        /*
         * If the file system is mounted read-only, don't update the
         * superblock write time.  This avoids updating the superblock
@@ -2368,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb,
        es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
-        if (sync)
+        if (sync) {
                error = sync_dirty_buffer(sbh);
+                if (buffer_write_io_error(sbh)) {
+                        ext3_msg(sb, KERN_ERR, "I/O error while writing "
+                               "superblock");
+                        clear_buffer_write_io_error(sbh);
+                        set_buffer_uptodate(sbh);
+                }
+        }
        return error;
 }
@@ -2997,16 +3020,16 @@ out:
 #endif
-static int ext3_get_sb(struct file_system_type *fs_type,
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
 }
 static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext3",
-        .get_sb         = ext3_get_sb,
+        .mount          = ext3_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..14c3af26c671 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * less than the blocksize * 8 ( which is the size
                 * of bitmap ), set rest of the block bitmap to 1
                 */
-                mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+                ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+                                     bh->b_data);
        }
        return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -489,7 +490,7 @@ error_return:
 * Check if filesystem has nblocks free & available for allocation.
 * On success return 1, return 0 on failure.
 */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
        s64 free_blocks, dirty_blocks, root_blocks;
        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 static struct kmem_cache *ext4_system_zone_cachep;
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
-                                             SLAB_RECLAIM_ACCOUNT);
        if (ext4_system_zone_cachep == NULL)
                return -ENOMEM;
        return 0;
 }
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
        kmem_cache_destroy(ext4_system_zone_cachep);
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..ece76fb6a40c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
                                struct file *filp);
 const struct file_operations ext4_dir_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = ext4_llseek,
        .read           = generic_read_dir,
        .readdir        = ext4_readdir,         /* we take BKL. needed?*/
        .unlocked_ioctl = ext4_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..8b5dd6369f82 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -168,7 +168,20 @@ struct mpage_da_data {
        int pages_written;
        int retval;
 };
-#define EXT4_IO_UNWRITTEN       0x1
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define EXT4_IO_END_UNWRITTEN   0x0001
+#define EXT4_IO_END_ERROR       0x0002
+struct ext4_io_page {
+        struct page     *p_page;
+        int             p_count;
+};
+#define MAX_IO_PAGES 128
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished IO list */
        struct inode            *inode;         /* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
        struct work_struct      work;           /* data work queue */
        struct kiocb            *iocb;          /* iocb struct for AIO */
        int                     result;         /* error value for AIO */
+        int                     num_io_pages;
+        struct ext4_io_page     *pages[MAX_IO_PAGES];
 } ext4_io_end_t;
+struct ext4_io_submit { /* argument of ext4_io_submit() and
+                         * ext4_bio_write_page(); NOTE(review): the
+                         * field roles noted below are inferred from
+                         * names and types only -- confirm in page-io.c */
+        int                     io_op;          /* op for io_bio? */
+        struct bio              *io_bio;        /* bio being built? */
+        ext4_io_end_t           *io_end;        /* related io_end */
+        struct ext4_io_page     *io_page;       /* related io page */
+        sector_t                io_next_block;  /* next block? confirm */
+};
 /*
 * Special inodes numbers
 */
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE             1024
 #define EXT4_MAX_BLOCK_SIZE             65536
 #define EXT4_MIN_BLOCK_LOG_SIZE         10
+#define EXT4_MAX_BLOCK_LOG_SIZE         16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)             ((s)->s_blocksize)
 #else
@@ -889,6 +913,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE     0x80000000 /* Initialize uninitialized itables */
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
@@ -1087,7 +1112,6 @@ struct ext4_sb_info {
        struct completion s_kobj_unregister;
        /* Journaling */
-        struct inode *s_journal_inode;
        struct journal_s *s_journal;
        struct list_head s_orphan;
        struct mutex s_orphan_lock;
@@ -1120,10 +1144,7 @@ struct ext4_sb_info {
        /* for buddy allocator */
        struct ext4_group_info ***s_group_info;
        struct inode *s_buddy_cache;
-        long s_blocks_reserved;
-        spinlock_t s_reserve_lock;
        spinlock_t s_md_lock;
-        tid_t s_last_transaction;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
@@ -1141,7 +1162,6 @@ struct ext4_sb_info {
        unsigned long s_mb_last_start;
        /* stats for buddy allocator */
-        spinlock_t s_mb_pa_lock;
        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
        atomic_t s_bal_success; /* we found long enough chunks */
        atomic_t s_bal_allocated;       /* in blocks */
@@ -1172,6 +1192,11 @@ struct ext4_sb_info {
        /* timer for periodic error stats printing */
        struct timer_list s_err_report;
+        /* Lazy inode table initialization info */
+        struct ext4_li_request *s_li_request;
+        /* Wait multiplier for lazy initialization thread */
+        unsigned int s_li_wait_mult;
 };
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1533,7 +1558,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                        ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT                   10
+#define EXT4_DEF_LI_MAX_START_DELAY             5
+#define EXT4_LAZYINIT_QUIT                      0x0001
+#define EXT4_LAZYINIT_RUNNING                   0x0002
+/*
+ * Lazy inode table initialization info
+ *
+ * NOTE(review): the roles below are inferred from field names and types
+ * only; confirm them against the lazy-init thread implementation.
+ *  - li_state: presumably holds the EXT4_LAZYINIT_QUIT and
+ *    EXT4_LAZYINIT_RUNNING state flags.
+ *  - li_task: presumably the task running the lazy-init thread.
+ *  - li_request_list: list head, presumably of struct ext4_li_request
+ *    entries linked through lr_request, with li_list_mtx as its lock.
+ */
+struct ext4_lazy_init {
+        unsigned long           li_state;
+        wait_queue_head_t       li_wait_daemon;
+        wait_queue_head_t       li_wait_task;
+        struct timer_list       li_timer;
+        struct task_struct      *li_task;
+        struct list_head        li_request_list;
+        struct mutex            li_list_mtx;
+};
+struct ext4_li_request {        /* type of ext4_sb_info->s_li_request */
+        struct super_block      *lr_super;
+        struct ext4_sb_info     *lr_sbi;
+        ext4_group_t            lr_next_group;  /* next group? confirm */
+        struct list_head        lr_request;     /* list linkage */
+        unsigned long           lr_next_sched;  /* jiffies? confirm */
+        unsigned long           lr_timeout;     /* jiffies? confirm */
+};
+struct ext4_features {
+        struct kobject f_kobj;  /* embedded kobject */
+        struct completion f_kobj_unregister;    /* NOTE(review): same
+                                 * shape as s_kobj_unregister in
+                                 * ext4_sb_info; presumably completed
+                                 * when f_kobj is released -- confirm */
+};
 /*
 * Function prototypes
@@ -1561,7 +1621,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1605,11 +1664,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-                                       struct buffer_head *bh,
+extern int ext4_init_inode_table(struct super_block *sb,
-                                       ext4_group_t group,
+                                 ext4_group_t group, int barrier);
-                                       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1677,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-                                                ext4_group_t, int);
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
@@ -1657,13 +1713,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
 /* ioctl.c */
@@ -1960,6 +2014,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2028,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
                                 ext4_fsblk_t start_blk,
                                 unsigned int count);
@@ -2002,6 +2057,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+                               struct page *page,
+                               int len,
+                               struct writeback_control *wbc);
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..28ce70fd9cd0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
        ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ *
+ * @ex: extent to read; it is not modified.
+ *
+ * Returns ee_start_lo (little-endian 32-bit field) in the low 32 bits,
+ * OR-ed with ee_start_hi (little-endian 16-bit field) shifted left by a
+ * total of 32 bits, both converted to CPU byte order.
+ *
+ * The 32-bit shift is written as "<< 31" followed by "<< 1".
+ * NOTE(review): presumably this avoids a single shift by the full width
+ * of a 32-bit type -- confirm before simplifying it.
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+        ext4_fsblk_t block;
+        block = le32_to_cpu(ex->ee_start_lo);
+        block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+        return block;
+}
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ *
+ * @ix: index entry to read; it is not modified.
+ *
+ * Returns ei_leaf_lo (little-endian 32-bit field) in the low 32 bits,
+ * OR-ed with ei_leaf_hi (little-endian 16-bit field) shifted left by a
+ * total of 32 bits, both converted to CPU byte order.
+ *
+ * The 32-bit shift is written as "<< 31" followed by "<< 1".
+ * NOTE(review): presumably this avoids a single shift by the full width
+ * of a 32-bit type -- confirm before simplifying it.
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+        ext4_fsblk_t block;
+        block = le32_to_cpu(ix->ei_leaf_lo);
+        block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+        return block;
+}
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ *
+ * @ex: extent whose ee_start_lo and ee_start_hi fields are overwritten.
+ * @pb: block number to store.
+ *
+ * The low 32 bits of @pb go to ee_start_lo and the next 16 bits (after a
+ * ">> 31" then ">> 1" shift and a 0xffff mask) go to ee_start_hi, each
+ * converted to little-endian.  Bits of @pb above those 48 are not stored.
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+                                         ext4_fsblk_t pb)
+{
+        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                      0xffff);
+}
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ *
+ * @ix: index entry whose ei_leaf_lo and ei_leaf_hi fields are overwritten.
+ * @pb: block number to store.
+ *
+ * The low 32 bits of @pb go to ei_leaf_lo and the next 16 bits (after a
+ * ">> 31" then ">> 1" shift and a 0xffff mask) go to ei_leaf_hi, each
+ * converted to little-endian.  Bits of @pb above those 48 are not stored.
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+                                         ext4_fsblk_t pb)
+{
+        ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                     0xffff);
+}
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
                                         sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-                                 struct ext4_ext_path *path,
-                                 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-                                                        ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
                                                        struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-                                                ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-                                                ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..0554c48cb1fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-        ext4_fsblk_t block;
-        block = le32_to_cpu(ex->ee_start_lo);
-        block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-        return block;
-}
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-        ext4_fsblk_t block;
-        block = le32_to_cpu(ix->ei_leaf_lo);
-        block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-        return block;
-}
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-        ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                /* try to predict block placement */
                ex = path[depth].p_ext;
                if (ex)
-                        return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+                        return (ext4_ext_pblock(ex) +
+                                (block - le32_to_cpu(ex->ee_block)));
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-        ext4_fsblk_t block = ext_pblock(ext);
+        ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
 {
-        ext4_fsblk_t block = idx_pblock(ext_idx);
+        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-                            idx_pblock(path->p_idx));
+                            ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_uninitialized(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
-                                  ext_pblock(path->p_ext));
+                                  ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug("  []");
        }
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_uninitialized(ex),
-                          ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
 }
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
        path->p_idx = l - 1;
        ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-                  idx_pblock(path->p_idx));
+                  ext4_idx_pblock(path->p_idx));
 #ifdef CHECK_BINSEARCH
        {
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
        path->p_ext = l - 1;
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
-                        ext_pblock(path->p_ext),
+                        ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_uninitialized(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
                ext4_ext_binsearch_idx(inode, path + ppos, block);
-                path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
-                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
        ext4_ext_show_path(inode, path);
@@ -739,9 +691,9 @@ err:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-                                struct ext4_ext_path *curp,
+                                 struct ext4_ext_path *curp,
-                                int logical, ext4_fsblk_t ptr)
+                                 int logical, ext4_fsblk_t ptr)
 {
        struct ext4_extent_idx *ix;
        int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                        EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(path[depth].p_ext->ee_block),
-                                ext_pblock(path[depth].p_ext),
+                                ext4_ext_pblock(path[depth].p_ext),
                                ext4_ext_is_uninitialized(path[depth].p_ext),
                                ext4_ext_get_actual_len(path[depth].p_ext),
                                newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                        le32_to_cpu(path[i].p_idx->ei_block),
-                                        idx_pblock(path[i].p_idx),
+                                        ext4_idx_pblock(path[i].p_idx),
                                        newblock);
                        /*memmove(++fidx, path[i].p_idx++,
                                        sizeof(struct ext4_extent_idx));
@@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-                  idx_pblock(EXT_FIRST_INDEX(neh)));
+                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
        neh->eh_depth = cpu_to_le16(path->p_depth + 1);
        err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1184,9 @@ out:
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
-int
+static int ext4_ext_search_left(struct inode *inode,
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+                                struct ext4_ext_path *path,
-                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
@@ -1286,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
        }
        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-        *phys = ext_pblock(ex) + ee_len - 1;
+        *phys = ext4_ext_pblock(ex) + ee_len - 1;
        return 0;
 }
@@ -1297,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
-int
+static int ext4_ext_search_right(struct inode *inode,
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+                                 struct ext4_ext_path *path,
-                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
@@ -1342,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                        }
                }
                *logical = le32_to_cpu(ex->ee_block);
-                *phys = ext_pblock(ex);
+                *phys = ext4_ext_pblock(ex);
                return 0;
        }
@@ -1357,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                /* next allocated block in this leaf */
                ex++;
                *logical = le32_to_cpu(ex->ee_block);
-                *phys = ext_pblock(ex);
+                *phys = ext4_ext_pblock(ex);
                return 0;
        }
@@ -1376,7 +1328,7 @@ got_index:
         * follow it and find the closest allocated
         * block to the right */
        ix++;
-        block = idx_pblock(ix);
+        block = ext4_idx_pblock(ix);
        while (++depth < path->p_depth) {
                bh = sb_bread(inode->i_sb, block);
                if (bh == NULL)
@@ -1388,7 +1340,7 @@ got_index:
                        return -EIO;
                }
                ix = EXT_FIRST_INDEX(eh);
-                block = idx_pblock(ix);
+                block = ext4_idx_pblock(ix);
                put_bh(bh);
        }
@@ -1402,7 +1354,7 @@ got_index:
        }
        ex = EXT_FIRST_EXTENT(eh);
        *logical = le32_to_cpu(ex->ee_block);
-        *phys = ext_pblock(ex);
+        *phys = ext4_ext_pblock(ex);
        put_bh(bh);
        return 0;
 }
@@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                return 0;
 #endif
-        if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                return 1;
        return 0;
 }
@@ -1585,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
-int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge(struct inode *inode,
-                          struct ext4_ext_path *path,
+                                 struct ext4_ext_path *path,
-                          struct ext4_extent *ex)
+                                 struct ext4_extent *ex)
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
@@ -1632,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
-                                    struct ext4_extent *newext,
+                                           struct ext4_extent *newext,
-                                    struct ext4_ext_path *path)
+                                           struct ext4_ext_path *path)
 {
        ext4_lblk_t b1, b2;
        unsigned int depth, len1;
@@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-                                ext4_ext_is_uninitialized(newext),
+                          ext4_ext_is_uninitialized(newext),
-                                ext4_ext_get_actual_len(newext),
+                          ext4_ext_get_actual_len(newext),
-                                le32_to_cpu(ex->ee_block),
+                          le32_to_cpu(ex->ee_block),
-                                ext4_ext_is_uninitialized(ex),
+                          ext4_ext_is_uninitialized(ex),
-                                ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                          ext4_ext_get_actual_len(ex),
+                          ext4_ext_pblock(ex));
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        return err;
@@ -1780,7 +1733,7 @@ has_space:
                /* there is no extent in this leaf, create first one */
                ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
-                                ext_pblock(newext),
+                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
                path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1747,7 @@ has_space:
                        ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
                                        "move %d from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
-                                        ext_pblock(newext),
+                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1761,7 @@ has_space:
                ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
                                "move %d from 0x%p to 0x%p\n",
                                le32_to_cpu(newext->ee_block),
-                                ext_pblock(newext),
+                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext),
                                nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1772,7 @@ has_space:
        le16_add_cpu(&eh->eh_entries, 1);
        nearex = path[depth].p_ext;
        nearex->ee_block = newext->ee_block;
-        ext4_ext_store_pblock(nearex, ext_pblock(newext));
+        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
 merge:
@@ -1845,9 +1798,9 @@ cleanup:
        return err;
 }
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                        ext4_lblk_t num, ext_prepare_callback func,
+                               ext4_lblk_t num, ext_prepare_callback func,
-                        void *cbdata)
+                               void *cbdata)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_ext_cache cbex;
@@ -1923,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
-                        cbex.ec_start = ext_pblock(ex);
+                        cbex.ec_start = ext4_ext_pblock(ex);
                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
@@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        /* free index block */
        path--;
-        leaf = idx_pblock(path->p_idx);
+        leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EIO;
@@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t start;
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
-                start = ext_pblock(ex) + ee_len - num;
+                start = ext4_ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
                ext4_free_blocks(handle, inode, 0, start, num, flags);
        } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        goto out;
                ext_debug("new extent: %u:%u:%llu\n", block, num,
-                                ext_pblock(ex));
+                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2374,9 @@ again:
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug("move to level %d (block %llu)\n",
-                                  i + 1, idx_pblock(path[i].p_idx));
+                                  i + 1, ext4_idx_pblock(path[i].p_idx));
                        memset(path + i + 1, 0, sizeof(*path));
-                        bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+                        bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
                        if (!bh) {
                                /* should we reset i_size? */
                                err = -EIO;
@@ -2535,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
-static void bi_complete(struct bio *bio, int error)
-{
-        complete((struct completion *)bio->bi_private);
-}
 /* FIXME!! we need to try to merge to left or right after zero-out  */
+/*
+ * Zero the blocks described by extent @ex: the length comes from
+ * ext4_ext_get_actual_len() and the starting block from ext4_ext_pblock(),
+ * and both are handed to sb_issue_zeroout() on the inode's superblock.
+ *
+ * Returns 0 or a negative value; positive results from sb_issue_zeroout()
+ * are folded to 0.
+ */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
+        ext4_fsblk_t ee_pblock;
+        unsigned int ee_len;
        int ret;
-        struct bio *bio;
-        int blkbits, blocksize;
-        sector_t ee_pblock;
-        struct completion event;
-        unsigned int ee_len, len, done, offset;
-        blkbits   = inode->i_blkbits;
-        blocksize = inode->i_sb->s_blocksize;
        ee_len    = ext4_ext_get_actual_len(ex);
-        ee_pblock = ext_pblock(ex);
+        ee_pblock = ext4_ext_pblock(ex);
-        /* convert ee_pblock to 512 byte sectors */
-        ee_pblock = ee_pblock << (blkbits - 9);
-        while (ee_len > 0) {
-                if (ee_len > BIO_MAX_PAGES)
-                        len = BIO_MAX_PAGES;
-                else
-                        len = ee_len;
-                bio = bio_alloc(GFP_NOIO, len);
-                if (!bio)
-                        return -ENOMEM;
-                bio->bi_sector = ee_pblock;
-                bio->bi_bdev   = inode->i_sb->s_bdev;
-                done = 0;
-                offset = 0;
-                while (done < len) {
-                        ret = bio_add_page(bio, ZERO_PAGE(0),
-                                                        blocksize, offset);
-                        if (ret != blocksize) {
-                                /*
-                                 * We can't add any more pages because of
-                                 * hardware limitations.  Start a new bio.
-                                 */
-                                break;
-                        }
-                        done++;
-                        offset += blocksize;
-                        if (offset >= PAGE_CACHE_SIZE)
-                                offset = 0;
-                }
-                init_completion(&event);
+        ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
-                bio->bi_private = &event;
+        /* Collapse any positive return to 0 so callers only see 0 or < 0 */
+        if (ret > 0)
-                bio->bi_end_io = bi_complete;
+                ret = 0;
-                submit_bio(WRITE, bio);
-                wait_for_completion(&event);
-                if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+        return ret;
-                        bio_put(bio);
-                        return -EIO;
-                }
-                bio_put(bio);
-                ee_len    -= done;
-                ee_pblock += done  << (blkbits - 9);
-        }
-        return 0;
 }
 #define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = map->m_lblk - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+        ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
        /*
         * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zeroed the full extent */
                return allocated;
@@ -2710,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = cpu_to_le16(ee_len - allocated);
                        ext4_ext_mark_uninitialized(ex);
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        ex3 = &newex;
@@ -2725,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                        goto fix_extent_len;
                                ex->ee_block = orig_ex.ee_block;
                                ex->ee_len   = orig_ex.ee_len;
-                                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                                ext4_ext_store_pblock(ex,
+                                        ext4_ext_pblock(&orig_ex));
                                ext4_ext_dirty(handle, inode, path + depth);
                                /* blocks available from map->m_lblk */
                                return allocated;
@@ -2782,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -2833,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zero out the first half */
                        /* blocks available from map->m_lblk */
@@ -2902,7 +2800,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -2915,7 +2813,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -2973,12 +2871,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = map->m_lblk - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+        ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
        /*
         * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -3099,7 +2997,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -3112,7 +3010,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -3180,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                unmap_underlying_metadata(bdev, block + i);
 }
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ *
+ * @handle: journal handle, passed to ext4_mark_inode_dirty() when the flag
+ *          is cleared
+ * @inode:  inode whose EXT4_INODE_EOFBLOCKS flag is examined
+ * @map:    mapping request; only map->m_lblk is read here
+ * @path:   extent path; path[ext_depth(inode)] supplies the leaf header and
+ *          the lower-numbered entries supply the index levels
+ * @len:    number of blocks in the request, starting at map->m_lblk
+ *
+ * Returns 0 when the flag is not set or has to stay set, -EIO when the flag
+ * is set but the leaf header has no entries, and otherwise the result of
+ * ext4_mark_inode_dirty() after the flag has been cleared.
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+                              struct ext4_map_blocks *map,
+                              struct ext4_ext_path *path,
+                              unsigned int len)
+{
+        int i, depth;
+        struct ext4_extent_header *eh;
+        struct ext4_extent *last_ex;
+        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+                return 0;
+        depth = ext_depth(inode);
+        eh = path[depth].p_hdr;
+        if (unlikely(!eh->eh_entries)) {
+                EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+                                 "EOFBLOCKS_FL set");
+                return -EIO;
+        }
+        last_ex = EXT_LAST_EXTENT(eh);
+        /*
+         * We should clear the EOFBLOCKS_FL flag if we are writing the
+         * last block in the last extent in the file.  We test this by
+         * first checking to see if the caller to
+         * ext4_ext_get_blocks() was interested in the last block (or
+         * a block beyond the last block) in the current extent.  If
+         * this turns out to be false, we can bail out from this
+         * function immediately.
+         */
+        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+            ext4_ext_get_actual_len(last_ex))
+                return 0;
+        /*
+         * If the caller does appear to be planning to write at or
+         * beyond the end of the current extent, we then test to see
+         * if the current extent is the last extent in the file, by
+         * checking to make sure it was reached via the rightmost node
+         * at each level of the tree.
+         */
+        for (i = depth-1; i >= 0; i--)
+                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                        return 0;
+        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+        return ext4_mark_inode_dirty(handle, inode);
+}
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3206,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * completed
                 */
                if (io)
-                        io->flag = EXT4_IO_UNWRITTEN;
+                        io->flag = EXT4_IO_END_UNWRITTEN;
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
-                if (ret >= 0)
+                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
+                        err = check_eofblocks_fl(handle, inode, map, path,
+                                                 map->m_len);
+                } else
+                        err = ret;
                goto out2;
        }
        /* buffered IO case */
@@ -3244,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-        if (ret >= 0)
+        if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
+                err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                if (err < 0)
+                        goto out2;
+        }
 out:
        if (ret <= 0) {
                err = ret;
@@ -3292,6 +3250,7 @@ out2:
        }
        return err ? err : allocated;
 }
 /*
 * Block allocation/map/preallocation routine for extents based files
 *
@@ -3315,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
-        struct ext4_extent newex, *ex, *last_ex;
+        struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-        int i, err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3341,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
-                                   + ext_pblock(&newex);
+                                   + ext4_ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3379,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-                ext4_fsblk_t ee_start = ext_pblock(ex);
+                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
                /*
@@ -3488,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io)
-                                io->flag = EXT4_IO_UNWRITTEN;
+                                io->flag = EXT4_IO_END_UNWRITTEN;
                        else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
-        if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
-                if (unlikely(!eh->eh_entries)) {
+        if (err)
-                        EXT4_ERROR_INODE(inode,
+                goto out2;
-                                         "eh->eh_entries == 0 and "
-                                         "EOFBLOCKS_FL set");
-                        err = -EIO;
-                        goto out2;
-                }
-                last_ex = EXT_LAST_EXTENT(eh);
-                /*
-                 * If the current leaf block was reached by looking at
-                 * the last index block all the way down the tree, and
-                 * we are extending the inode beyond the last extent
-                 * in the current leaf block, then clear the
-                 * EOFBLOCKS_FL flag.
-                 */
-                for (i = depth-1; i >= 0; i--) {
-                        if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-                                break;
-                }
-                if ((i < 0) &&
-                    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-                     ext4_ext_get_actual_len(last_ex)))
-                        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-        }
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
                /* free data blocks we just allocated */
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-                ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+                ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
        /* previous routine could use block we allocated */
-        newblock = ext_pblock(&newex);
+        newblock = ext4_ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
        if (allocated > map->m_len)
                allocated = map->m_len;
@@ -3729,7 +3667,7 @@ retry:
                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
-                                    inode->i_ino, block, max_blocks);
+                                    inode->i_ino, map.m_lblk, max_blocks);
 #endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..5a5c55ddceef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -130,8 +130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        return dquot_file_open(inode, filp);
 }
+/*
+ * ext4_llseek() - seek within an ext4 file.
+ *
+ * Copied from generic_file_llseek(); the only intended difference is the
+ * upper bound applied to the resulting offset.  Inodes without
+ * EXT4_INODE_EXTENTS are bounded by s_bitmap_maxbytes, all other inodes
+ * by the superblock's s_maxbytes.
+ *
+ * Returns the resulting file position, or -EINVAL when the computed offset
+ * is negative or larger than that bound.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+        struct inode *inode = file->f_mapping->host;
+        loff_t limit;
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+                limit = inode->i_sb->s_maxbytes;
+        else
+                limit = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+        mutex_lock(&inode->i_mutex);
+        if (origin == SEEK_END) {
+                offset += inode->i_size;
+        } else if (origin == SEEK_CUR) {
+                /* A relative seek by zero only reports the position */
+                if (!offset) {
+                        mutex_unlock(&inode->i_mutex);
+                        return file->f_pos;
+                }
+                offset += file->f_pos;
+        }
+        if (offset < 0 || offset > limit) {
+                mutex_unlock(&inode->i_mutex);
+                return -EINVAL;
+        }
+        /* Only touch f_pos/f_version when the position really moves */
+        if (file->f_pos != offset) {
+                file->f_pos = offset;
+                file->f_version = 0;
+        }
+        mutex_unlock(&inode->i_mutex);
+        return offset;
+}
 const struct file_operations ext4_file_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = ext4_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3f3ff5ee8f9d..c1a7bc923cf6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
 #include <trace/events/ext4.h>
+/*
+ * Debug aid: when EXT4_DEBUG is defined, print every entry on the inode's
+ * i_completed_io_list together with the entries its list node links to.
+ * Compiles to an empty function otherwise.
+ */
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        ext4_io_end_t *entry, *prev_io, *next_io;
+        unsigned long irq_flags;
+        if (list_empty(&ei->i_completed_io_list)) {
+                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+                return;
+        }
+        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+        /* Walk the list under i_completed_io_lock */
+        spin_lock_irqsave(&ei->i_completed_io_lock, irq_flags);
+        list_for_each_entry(entry, &ei->i_completed_io_list, list) {
+                prev_io = container_of(entry->list.prev, ext4_io_end_t, list);
+                next_io = container_of(entry->list.next, ext4_io_end_t, list);
+                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                            entry, inode->i_ino, prev_io, next_io);
+        }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, irq_flags);
+#endif
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might need to do the conversion. This function walks through
+ * the list and converts the related unwritten extents for completed IO
+ * to written.
+ * The function returns 0 on success (including when the list is empty),
+ * or the most recent negative value returned by ext4_end_io_nolock().
+ */
+static int flush_completed_IO(struct inode *inode)
+{
+        ext4_io_end_t *io;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long flags;
+        int ret = 0;
+        int ret2 = 0;
+        if (list_empty(&ei->i_completed_io_list))
+                return ret;
+        dump_completed_IO(inode);
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        while (!list_empty(&ei->i_completed_io_list)){
+                io = list_entry(ei->i_completed_io_list.next,
+                                ext4_io_end_t, list);
+                /*
+                 * Calling ext4_end_io_nolock() to convert completed
+                 * IO to written.
+                 *
+                 * When ext4_sync_file() is called, run_queue() may already
+                 * be about to flush the work corresponding to this io
+                 * structure. It will be upset if it finds the io structure
+                 * related to the work-to-be-scheduled is freed.
+                 *
+                 * Thus we need to keep the io structure still valid here after
+                 * conversion finished. The io structure has a flag to
+                 * avoid double converting from both fsync and background work
+                 * queue work.
+                 */
+                /* The lock is dropped around the conversion call */
+                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+                ret = ext4_end_io_nolock(io);
+                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+                /*
+                 * NOTE(review): on failure the entry is left on the list,
+                 * so the next iteration picks the same io again; whether
+                 * this can spin depends on ext4_end_io_nolock() -- confirm.
+                 */
+                if (ret < 0)
+                        ret2 = ret;
+                else
+                        list_del_init(&io->list);
+        }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        return (ret2 < 0) ? ret2 : 0;
+}
 /*
 * If we're not journaling and this is a just-created file, we have to
 * sync our parent directory (if it was freshly created) since
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..1ce240a23ebb 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 {
        int i;
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                ext4_group_t block_group,
+                                       struct buffer_head *bh,
-                                struct ext4_group_desc *gdp)
+                                       ext4_group_t block_group,
+                                       struct ext4_group_desc *gdp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        }
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
        return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return NULL;
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                return bh;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
-void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-                       int flex_size, struct orlov_stats *stats)
+                            int flex_size, struct orlov_stats *stats)
 {
        struct ext4_group_desc *desc;
        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
        int free = 0, retval = 0, count;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+        /*
+         * We have to be sure that new inode allocation does not race with
+         * inode table initialization, because otherwise we may end up
+         * allocating and writing new inode right before sb_issue_zeroout
+         * takes place and overwriting our new inode with zeroes. So we
+         * take alloc_sem to prevent it.
+         */
+        down_read(&grp->alloc_sem);
        ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_unlock_group(sb, group);
+                up_read(&grp->alloc_sem);
                ext4_error(sb, "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
        ext4_unlock_group(sb, group);
+        up_read(&grp->alloc_sem);
        return retval;
 }
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
        }
        return count;
 }
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ *
+ * @sb:      superblock of the filesystem
+ * @group:   block group whose inode table is to be zeroed
+ * @barrier: if non-zero, blkdev_issue_flush() is called after the zeroout
+ *
+ * Returns 0 on success or when there is nothing to do (no group descriptor,
+ * or EXT4_BG_INODE_ZEROED already set), 1 if the filesystem is read-only or
+ * the computed number of used inode table blocks is out of range, and
+ * otherwise the error from starting the journal handle, getting write
+ * access to the descriptor buffer, sb_issue_zeroout() or
+ * ext4_handle_dirty_metadata().
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                 int barrier)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp = NULL;
+        struct buffer_head *group_desc_bh;
+        handle_t *handle;
+        ext4_fsblk_t blk;
+        int num, ret = 0, used_blks = 0;
+        /* This should not happen, but just to be sure check this */
+        if (sb->s_flags & MS_RDONLY) {
+                ret = 1;
+                goto out;
+        }
+        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+        if (!gdp)
+                goto out;
+        /*
+         * We do not need to lock this, because we are the only one
+         * handling this flag.
+         */
+        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+                goto out;
+        handle = ext4_journal_start_sb(sb, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        down_write(&grp->alloc_sem);
+        /*
+         * If inode bitmap was already initialized there may be some
+         * used inodes so we need to skip blocks with used inodes in
+         * inode table.
+         */
+        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+                used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                            ext4_itable_unused_count(sb, gdp)),
+                            sbi->s_inodes_per_block);
+        if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+                ext4_error(sb, "Something is wrong with group %u\n"
+                           "Used itable blocks: %d"
+                           "itable unused count: %u\n",
+                           group, used_blks,
+                           ext4_itable_unused_count(sb, gdp));
+                ret = 1;
+                /*
+                 * alloc_sem is write-held and the journal handle is
+                 * running here, so leave through err_out to release both.
+                 */
+                goto err_out;
+        }
+        blk = ext4_inode_table(sb, gdp) + used_blks;
+        num = sbi->s_itb_per_group - used_blks;
+        BUFFER_TRACE(group_desc_bh, "get_write_access");
+        ret = ext4_journal_get_write_access(handle,
+                                            group_desc_bh);
+        if (ret)
+                goto err_out;
+        /*
+         * Skip zeroout if the inode table is full. But we set the ZEROED
+         * flag anyway, because obviously, when it is full it does not need
+         * further zeroing.
+         */
+        if (unlikely(num == 0))
+                goto skip_zeroout;
+        ext4_debug("going to zero out inode table in group %d\n",
+                   group);
+        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+        if (ret < 0)
+                goto err_out;
+        if (barrier)
+                blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+skip_zeroout:
+        /* Flag and checksum are updated together under the group lock */
+        ext4_lock_group(sb, group);
+        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+        ext4_unlock_group(sb, group);
+        BUFFER_TRACE(group_desc_bh,
+                     "call ext4_handle_dirty_metadata");
+        ret = ext4_handle_dirty_metadata(handle, NULL,
+                                         group_desc_bh);
+err_out:
+        up_write(&grp->alloc_sem);
+        ext4_journal_stop(handle);
+out:
+        return ret;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..191616470466 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 /*
 * Test whether an inode is a fast symlink.
@@ -755,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+                if (unlikely(!bh)) {
+                        err = -EIO;
+                        goto failed;
+                }
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
@@ -1207,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                                break;
                        idx++;
                        num++;
-                        if (num >= max_pages)
+                        if (num >= max_pages) {
+                                done = 1;
                                break;
+                        }
                }
                pagevec_release(&pvec);
        }
@@ -1538,10 +1551,10 @@ static int do_journal_get_write_access(handle_t *handle,
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
-         * __block_prepare_write() could have dirtied some buffers. Clean
+         * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
-         * by __block_prepare_write() isn't a real problem here as we clear
+         * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
@@ -1995,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                              struct ext4_map_blocks *map)
 {
-        long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+        loff_t size = i_size_read(inode);
+        unsigned int len, block_start;
+        struct buffer_head *bh, *page_bufs = NULL;
+        int journal_data = ext4_should_journal_data(inode);
+        sector_t pblock = 0, cur_logical = 0;
+        struct ext4_io_submit io_submit;
        BUG_ON(mpd->next_page <= mpd->first_page);
+        memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                        int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
                        index = page->index;
                        if (index > end)
                                break;
+                        if (index == size >> PAGE_CACHE_SHIFT)
+                                len = size & ~PAGE_CACHE_MASK;
+                        else
+                                len = PAGE_CACHE_SIZE;
+                        if (map) {
+                                cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                        inode->i_blkbits);
+                                pblock = map->m_pblk + (cur_logical -
+                                                        map->m_lblk);
+                        }
                        index++;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
-                        pages_skipped = mpd->wbc->pages_skipped;
-                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                                /*
-                                 * have successfully written the page
-                                 * without skipping the same
-                                 */
-                                mpd->pages_written++;
                        /*
-                         * In error case, we have to continue because
+                         * If the page does not have buffers (for
-                         * remaining pages are still locked
+                         * whatever reason), try to create them using
-                         * XXX: unlock and re-dirty them?
+                         * __block_write_begin.  If this fails,
+                         * redirty the page and move on.
                         */
-                        if (ret == 0)
+                        if (!page_has_buffers(page)) {
-                                ret = err;
+                                if (__block_write_begin(page, 0, len,
-                }
+                                                noalloc_get_block_write)) {
-                pagevec_release(&pvec);
+                                redirty_page:
-        }
+                                        redirty_page_for_writepage(mpd->wbc,
-        return ret;
+                                                                   page);
-}
+                                        unlock_page(page);
+                                        continue;
-/*
+                                }
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+                                commit_write = 1;
- *
+                        }
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                 struct ext4_map_blocks *map)
-{
-        struct inode *inode = mpd->inode;
-        struct address_space *mapping = inode->i_mapping;
-        int blocks = map->m_len;
-        sector_t pblock = map->m_pblk, cur_logical;
-        struct buffer_head *head, *bh;
-        pgoff_t index, end;
-        struct pagevec pvec;
-        int nr_pages, i;
-        index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        pagevec_init(&pvec, 0);
-        while (index <= end) {
-                /* XXX: optimize tail */
-                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-                if (nr_pages == 0)
-                        break;
-                for (i = 0; i < nr_pages; i++) {
-                        struct page *page = pvec.pages[i];
-                        index = page->index;
-                        if (index > end)
-                                break;
-                        index++;
-                        BUG_ON(!PageLocked(page));
-                        BUG_ON(PageWriteback(page));
-                        BUG_ON(!page_has_buffers(page));
-                        bh = page_buffers(page);
-                        head = bh;
-                        /* skip blocks out of the range */
-                        do {
-                                if (cur_logical >= map->m_lblk)
-                                        break;
-                                cur_logical++;
-                        } while ((bh = bh->b_this_page) != head);
+                        bh = page_bufs = page_buffers(page);
+                        block_start = 0;
                        do {
-                                if (cur_logical >= map->m_lblk + blocks)
+                                if (!bh)
-                                        break;
+                                        goto redirty_page;
+                                if (map && (cur_logical >= map->m_lblk) &&
-                                if (buffer_delay(bh) || buffer_unwritten(bh)) {
+                                    (cur_logical <= (map->m_lblk +
+                                                     (map->m_len - 1)))) {
-                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                        } else {
-                                                /*
-                                                 * unwritten already should have
-                                                 * blocknr assigned. Verify that
-                                                 */
-                                                clear_buffer_unwritten(bh);
-                                                BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                        if (buffer_unwritten(bh) ||
+                                            buffer_mapped(bh))
+                                                BUG_ON(bh->b_blocknr != pblock);
+                                        if (map->m_flags & EXT4_MAP_UNINIT)
+                                                set_buffer_uninit(bh);
+                                        clear_buffer_unwritten(bh);
+                                }
-                                } else if (buffer_mapped(bh))
+                                /* redirty page if block allocation undone */
-                                        BUG_ON(bh->b_blocknr != pblock);
+                                if (buffer_delay(bh) || buffer_unwritten(bh))
+                                        redirty_page = 1;
-                                if (map->m_flags & EXT4_MAP_UNINIT)
+                                bh = bh->b_this_page;
-                                        set_buffer_uninit(bh);
+                                block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                        } while ((bh = bh->b_this_page) != head);
+                        } while (bh != page_bufs);
+                        if (redirty_page)
+                                goto redirty_page;
+                        if (commit_write)
+                                /* mark the buffer_heads as dirty & uptodate */
+                                block_commit_write(page, 0, len);
+                        /*
+                         * Delalloc doesn't support data journalling,
+                         * but eventually maybe we'll lift this
+                         * restriction.
+                         */
+                        if (unlikely(journal_data && PageChecked(page)))
+                                err = __ext4_journalled_writepage(page, len);
+                        else
+                                err = ext4_bio_write_page(&io_submit, page,
+                                                          len, mpd->wbc);
+                        if (!err)
+                                mpd->pages_written++;
+                        /*
+                         * In error case, we have to continue because
+                         * remaining pages are still locked
+                         */
+                        if (ret == 0)
+                                ret = err;
                }
                pagevec_release(&pvec);
        }
+        ext4_io_submit(&io_submit);
+        return ret;
 }
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
 {
@@ -2187,35 +2193,32 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map it
+ *       if necessary, and then submit it for I/O
 *
 * @mpd - bh describing space
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-        struct ext4_map_blocks map;
+        struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
        /*
-         * We consider only non-mapped and non-allocated blocks
+         * If the blocks are mapped already, or we couldn't accumulate
-         */
+         * any blocks, then proceed immediately to the submission stage.
-        if ((mpd->b_state  & (1 << BH_Mapped)) &&
-                !(mpd->b_state & (1 << BH_Delay)) &&
-                !(mpd->b_state & (1 << BH_Unwritten)))
-                return 0;
-        /*
-         * If we didn't accumulate anything to write simply return
         */
-        if (!mpd->b_size)
+        if ((mpd->b_size == 0) ||
-                return 0;
+            ((mpd->b_state  & (1 << BH_Mapped)) &&
+             !(mpd->b_state & (1 << BH_Delay)) &&
+             !(mpd->b_state & (1 << BH_Unwritten))))
+                goto submit_io;
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
@@ -2252,17 +2255,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                err = blks;
                /*
-                 * If get block returns with error we simply
+                 * If get block returns EAGAIN, or ENOSPC while there
-                 * return. Later writepage will redirty the page and
+                 * still appear to be free blocks, we go straight to
-                 * writepages will find the dirty page again
+                 * mpage_da_submit_io() without a mapping, which will
+                 * just redirty pages with delayed/unwritten buffers.
                 */
                if (err == -EAGAIN)
-                        return 0;
+                        goto submit_io;
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                        return 0;
+                        goto submit_io;
                }
                /*
@@ -2287,10 +2291,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-                return err;
+                return;
        }
        BUG_ON(blks == 0);
+        mapp = &map;
        if (map.m_flags & EXT4_MAP_NEW) {
                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                int i;
@@ -2299,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
        }
-        /*
-         * If blocks are delayed marked, we need to
-         * put actual blocknr and drop delayed bit
-         */
-        if ((mpd->b_state & (1 << BH_Delay)) ||
-            (mpd->b_state & (1 << BH_Unwritten)))
-                mpage_put_bnr_to_bhs(mpd, &map);
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                        return err;
+                        /* This only happens if the journal is aborted */
+                        return;
        }
        /*
@@ -2321,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-                return ext4_mark_inode_dirty(handle, mpd->inode);
+                err = ext4_mark_inode_dirty(handle, mpd->inode);
+                if (err)
+                        ext4_error(mpd->inode->i_sb,
+                                   "Failed to mark inode %lu dirty",
+                                   mpd->inode->i_ino);
        }
-        return 0;
+submit_io:
+        mpage_da_submit_io(mpd, mapp);
+        mpd->io_done = 1;
 }
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2405,7 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-        if (mpage_da_map_blocks(mpd) == 0)
+        mpage_da_map_and_submit(mpd);
-                mpage_da_submit_io(mpd);
-        mpd->io_done = 1;
        return;
 }
@@ -2422,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 * The function finds extents of pages and scan them for all blocks.
 */
 static int __mpage_da_writepage(struct page *page,
-                                struct writeback_control *wbc, void *data)
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd)
 {
-        struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
@@ -2435,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                 * and start IO on them using writepage()
+                 * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                        if (mpage_da_map_blocks(mpd) == 0)
+                        mpage_da_map_and_submit(mpd);
-                                mpage_da_submit_io(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                        mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                if (buffer_delay(bh))
                        return 0; /* Not sure this could or should happen */
                /*
-                 * XXX: __block_prepare_write() unmaps passed block,
+                 * XXX: __block_write_begin() unmaps passed block, is it OK?
-                 * is it OK?
                 */
                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
@@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
 * This function is used as a standard get_block_t calback function
 * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
 * These functions should only try to map a single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int ret = 0;
        int err;
+        ClearPageChecked(page);
        page_bufs = page_buffers(page);
        BUG_ON(!page_bufs);
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2700,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
 {
-        int ret = 0;
+        int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
@@ -2713,71 +2713,44 @@ static int ext4_writepage(struct page *page,
        else
                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
+        /*
-                page_bufs = page_buffers(page);
+         * If the page does not have buffers (for whatever reason),
-                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+         * try to create them using __block_write_begin.  If this
-                                        ext4_bh_delay_or_unwritten)) {
+         * fails, redirty the page and move on.
-                        /*
+         */
-                         * We don't want to do  block allocation
+        if (!page_has_buffers(page)) {
-                         * So redirty the page and return
+                if (__block_write_begin(page, 0, len,
-                         * We may reach here when we do a journal commit
+                                        noalloc_get_block_write)) {
-                         * via journal_submit_inode_data_buffers.
+                redirty_page:
-                         * If we don't have mapping block we just ignore
-                         * them. We can also reach here via shrink_page_list
-                         */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-        } else {
+                commit_write = 1;
+        }
+        page_bufs = page_buffers(page);
+        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                              ext4_bh_delay_or_unwritten)) {
                /*
-                 * The test for page_has_buffers() is subtle:
+                 * We don't want to do block allocation, so redirty
-                 * We know the page is dirty but it lost buffers. That means
+                 * the page and return.  We may reach here when we do
-                 * that at some moment in time after write_begin()/write_end()
+                 * a journal commit via journal_submit_inode_data_buffers.
-                 * has been called all buffers have been clean and thus they
+                 * We can also reach here via shrink_page_list
-                 * must have been written at least once. So they are all
-                 * mapped and we can happily proceed with mapping them
-                 * and writing the page.
-                 *
-                 * Try to initialize the buffer_heads and check whether
-                 * all are mapped and non delay. We don't want to
-                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, len,
+                goto redirty_page;
-                                          noalloc_get_block_write);
+        }
-                if (!ret) {
+        if (commit_write)
-                        page_bufs = page_buffers(page);
-                        /* check whether all are mapped and non delay */
-                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                                ext4_bh_delay_or_unwritten)) {
-                                redirty_page_for_writepage(wbc, page);
-                                unlock_page(page);
-                                return 0;
-                        }
-                } else {
-                        /*
-                         * We can't do block allocation here
-                         * so just redity the page and unlock
-                         * and return
-                         */
-                        redirty_page_for_writepage(wbc, page);
-                        unlock_page(page);
-                        return 0;
-                }
                /* now mark the buffer_heads as dirty and uptodate */
                block_commit_write(page, 0, len);
-        }
-        if (PageChecked(page) && ext4_should_journal_data(inode)) {
+        if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
-                ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
-        }
-        if (page_bufs && buffer_uninit(page_bufs)) {
+        if (buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@ -2824,25 +2797,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 */
 static int write_cache_pages_da(struct address_space *mapping,
                                struct writeback_control *wbc,
-                                struct mpage_da_data *mpd)
+                                struct mpage_da_data *mpd,
+                                pgoff_t *done_index)
 {
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
-        int nr_pages;
+        unsigned nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        long nr_to_write = wbc->nr_to_write;
+        int tag;
        pagevec_init(&pvec, 0);
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag = PAGECACHE_TAG_TOWRITE;
+        else
+                tag = PAGECACHE_TAG_DIRTY;
+        *done_index = index;
        while (!done && (index <= end)) {
                int i;
-                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-                              PAGECACHE_TAG_DIRTY,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@ -2862,6 +2842,8 @@ static int write_cache_pages_da(struct address_space *mapping,
                                break;
                        }
+                        *done_index = page->index + 1;
                        lock_page(page);
                        /*
@@ -2947,6 +2929,8 @@ static int ext4_da_writepages(struct address_space *mapping,
        long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+        pgoff_t done_index = 0;
+        pgoff_t end;
        trace_ext4_da_writepages(inode, wbc);
@@ -2982,8 +2966,11 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-        } else
+                end = -1;
+        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        }
        /*
         * This works around two forms of stupidity.  The first is in
@@ -3002,9 +2989,12 @@ static int ext4_da_writepages(struct address_space *mapping,
         * sbi->max_writeback_mb_bump whichever is smaller.
         */
        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-        if (!range_cyclic && range_whole)
+        if (!range_cyclic && range_whole) {
-                desired_nr_to_write = wbc->nr_to_write * 8;
+                if (wbc->nr_to_write == LONG_MAX)
-        else
+                        desired_nr_to_write = wbc->nr_to_write;
+                else
+                        desired_nr_to_write = wbc->nr_to_write * 8;
+        } else
                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                           max_pages);
        if (desired_nr_to_write > max_pages)
@@ -3021,6 +3011,9 @@ static int ext4_da_writepages(struct address_space *mapping,
        pages_skipped = wbc->pages_skipped;
 retry:
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag_pages_for_writeback(mapping, index, end);
        while (!ret && wbc->nr_to_write > 0) {
                /*
@@ -3059,16 +3052,14 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-                ret = write_cache_pages_da(mapping, wbc, &mpd);
+                ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                        if (mpage_da_map_blocks(&mpd) == 0)
+                        mpage_da_map_and_submit(&mpd);
-                                mpage_da_submit_io(&mpd);
-                        mpd.io_done = 1;
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
                trace_ext4_da_write_pages(inode, &mpd);
@@ -3115,14 +3106,13 @@ retry:
                         __func__, wbc->nr_to_write, ret);
        /* Update index */
-        index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-                mapping->writeback_index = index;
+                mapping->writeback_index = done_index;
 out_writepages:
        wbc->nr_to_write -= nr_to_writebump;
@@ -3457,15 +3447,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-        BUG_ON(!io);
-        if (io->page)
-                put_page(io->page);
-        iput(io->inode);
-        kfree(io);
-}
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
        struct buffer_head *head, *bh;
@@ -3642,173 +3623,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef  EXT4_DEBUG
-        struct list_head *cur, *before, *after;
-        ext4_io_end_t *io, *io0, *io1;
-        unsigned long flags;
-        if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-                return;
-        }
-        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-                cur = &io->list;
-                before = cur->prev;
-                io0 = container_of(before, ext4_io_end_t, list);
-                after = cur->next;
-                io1 = container_of(after, ext4_io_end_t, list);
-                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                            io, inode->i_ino, io0, io1);
-        }
-        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-        struct inode *inode = io->inode;
-        loff_t offset = io->offset;
-        ssize_t size = io->size;
-        int ret = 0;
-        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                   "list->prev 0x%p\n",
-                   io, inode->i_ino, io->list.next, io->list.prev);
-        if (list_empty(&io->list))
-                return ret;
-        if (io->flag != EXT4_IO_UNWRITTEN)
-                return ret;
-        ret = ext4_convert_unwritten_extents(inode, offset, size);
-        if (ret < 0) {
-                printk(KERN_EMERG "%s: failed to convert unwritten"
-                        "extents to written extents, error is %d"
-                        " io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-                return ret;
-        }
-        if (io->iocb)
-                aio_complete(io->iocb, io->result, 0);
-        /* clear the DIO AIO unwritten flag */
-        io->flag = 0;
-        return ret;
-}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-        ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-        struct inode            *inode = io->inode;
-        struct ext4_inode_info  *ei = EXT4_I(inode);
-        unsigned long           flags;
-        int                     ret;
-        mutex_lock(&inode->i_mutex);
-        ret = ext4_end_io_nolock(io);
-        if (ret < 0) {
-                mutex_unlock(&inode->i_mutex);
-                return;
-        }
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        if (!list_empty(&io->list))
-                list_del_init(&io->list);
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        mutex_unlock(&inode->i_mutex);
-        ext4_free_io_end(io);
-}
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-        ext4_io_end_t *io;
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        unsigned long flags;
-        int ret = 0;
-        int ret2 = 0;
-        if (list_empty(&ei->i_completed_io_list))
-                return ret;
-        dump_completed_IO(inode);
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        while (!list_empty(&ei->i_completed_io_list)){
-                io = list_entry(ei->i_completed_io_list.next,
-                                ext4_io_end_t, list);
-                /*
-                 * Calling ext4_end_io_nolock() to convert completed
-                 * IO to written.
-                 *
-                 * When ext4_sync_file() is called, run_queue() may already
-                 * about to flush the work corresponding to this io structure.
-                 * It will be upset if it founds the io structure related
-                 * to the work-to-be schedule is freed.
-                 *
-                 * Thus we need to keep the io structure still valid here after
-                 * convertion finished. The io structure has a flag to
-                 * avoid double converting from both fsync and background work
-                 * queue work.
-                 */
-                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-                ret = ext4_end_io_nolock(io);
-                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-                if (ret < 0)
-                        ret2 = ret;
-                else
-                        list_del_init(&io->list);
-        }
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        return (ret2 < 0) ? ret2 : 0;
-}
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-        ext4_io_end_t *io = NULL;
-        io = kmalloc(sizeof(*io), flags);
-        if (io) {
-                igrab(inode);
-                io->inode = inode;
-                io->flag = 0;
-                io->offset = 0;
-                io->size = 0;
-                io->page = NULL;
-                io->iocb = NULL;
-                io->result = 0;
-                INIT_WORK(&io->work, ext4_end_io_work);
-                INIT_LIST_HEAD(&io->list);
-        }
-        return io;
-}
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private, int ret,
                            bool is_async)
@@ -3828,7 +3642,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
        /* if not aio dio with unwritten extents, just free io and return */
-        if (io_end->flag != EXT4_IO_UNWRITTEN){
+        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
                iocb->private = NULL;
 out:
@@ -3845,14 +3659,14 @@ out:
        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-        /* queue the work to convert unwritten extents to written */
-        queue_work(wq, &io_end->work);
        /* Add the io_end to per-inode completed aio dio list*/
        ei = EXT4_I(io_end->inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        list_add_tail(&io_end->list, &ei->i_completed_io_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
        iocb->private = NULL;
 }
@@ -3873,7 +3687,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
                goto out;
        }
-        io_end->flag = EXT4_IO_UNWRITTEN;
+        io_end->flag = EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;
        /* Add the io_end to per-inode completed io list*/
@@ -5464,6 +5278,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        error = inode_change_ok(inode, attr);
@@ -5519,8 +5334,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        error = PTR_ERR(handle);
                        goto err_out;
                }
+                if (ext4_handle_valid(handle)) {
-                error = ext4_orphan_add(handle, inode);
+                        error = ext4_orphan_add(handle, inode);
+                        orphan = 1;
+                }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
@@ -5538,6 +5355,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                                orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
@@ -5560,7 +5378,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
-        if (inode->i_nlink)
+        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
        if (!rc && (ia_valid & ATTR_MODE))
@@ -5643,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 19aa0d44d822..c58eba34724a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There is one cache for each unique
+ * s_blocksize_bits, shared by every mounted filesystem of that size */
+#define NR_GRPINFO_CACHES       \
+        (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -939,6 +947,85 @@ out:
 }
 /*
+ * Lock the group_info alloc_sem of every group that shares
+ * a buddy cache page with @group, so that no other operation
+ * on that page runs in parallel while the buddy cache lock
+ * is held.  Returns the number of groups locked, which
+ * ext4_mb_put_buddy_cache_lock() takes as locked_group.
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        int groups_per_page;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        groups_per_page = blocks_per_page >> 1;
+        if (groups_per_page == 0)
+                groups_per_page = 1;
+        /* lock each group the page covers, stopping at ngroups */
+        for (i = 0; i < groups_per_page; i++) {
+                if ((first_group + i) >= ngroups)
+                        break;
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* take each group's write allocation
+                 * semaphore. This makes sure there is
+                 * no block allocation going on in any
+                 * of these groups
+                 */
+                down_write_nested(&grp->alloc_sem, i);
+        }
+        return i;
+}
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                         ext4_group_t group, int locked_group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        /* release the alloc_sem of the first locked_group groups */
+        for (i = 0; i < locked_group; i++) {
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* drop this group's write allocation
+                 * semaphore, undoing the down_write_nested()
+                 * done in ext4_mb_get_buddy_cache_lock() for
+                 * the same first_group + i
+                 */
+                up_write(&grp->alloc_sem);
+        }
+}
+/*
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        return 0;
 }
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-        int i;
-        int block, pnum;
-        int blocks_per_page;
-        int groups_per_page;
-        ext4_group_t ngroups = ext4_get_groups_count(sb);
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-        groups_per_page = blocks_per_page >> 1;
-        if (groups_per_page == 0)
-                groups_per_page = 1;
-        /* read all groups the page covers into the cache */
-        for (i = 0; i < groups_per_page; i++) {
-                if ((first_group + i) >= ngroups)
-                        break;
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                down_write_nested(&grp->alloc_sem, i);
-        }
-        return i;
-}
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group, int locked_group)
-{
-        int i;
-        int block, pnum;
-        int blocks_per_page;
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-        /* release locks on all the groups */
-        for (i = 0; i < locked_group; i++) {
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                up_write(&grp->alloc_sem);
-        }
-}
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
        .release        = seq_release,
 };
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+        BUG_ON(!cachep);        /* slot is filled by ext4_mb_init() */
+        return cachep;
+}
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
 {
-        int i, len;
+        int i;
        int metalen = 0;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
+        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        /*
         * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                        meta_group_info;
        }
-        /*
-         * calculate needed size. if change bb_counters size,
-         * don't forget about ext4_mb_generate_buddy()
-         */
-        len = offsetof(typeof(**meta_group_info),
-                       bb_counters[sb->s_blocksize_bits + 2]);
        meta_group_info =
                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-        meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+        meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (meta_group_info[i] == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
                goto exit_group_info;
        }
+        memset(meta_group_info[i], 0, kmem_cache_size(cachep));
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                &(meta_group_info[i]->bb_state));
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        int num_meta_group_infos_max;
        int array_size;
        struct ext4_group_desc *desc;
+        struct kmem_cache *cachep;
        /* This is the number of blocks used by GDT */
        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
                goto err_freesgi;
        }
+        sbi->s_buddy_cache->i_ino = get_next_ino();
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
        return 0;
 err_freebuddy:
+        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0)
-                kfree(ext4_get_group_info(sb, i));
+                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
        i = num_meta_group_infos;
        while (i-- > 0)
                kfree(sbi->s_group_info[i]);
@@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned offset;
        unsigned max;
        int ret;
+        int cache_index;
+        struct kmem_cache *cachep;
+        char *namep = NULL;
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
-                return -ENOMEM;
+                ret = -ENOMEM;
+                goto out;
        }
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-                kfree(sbi->s_mb_offsets);
+                ret = -ENOMEM;
-                return -ENOMEM;
+                goto out;
+        }
+        cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        cachep = ext4_groupinfo_caches[cache_index];
+        if (!cachep) {
+                char name[32];
+                int len = offsetof(struct ext4_group_info,
+                                        bb_counters[sb->s_blocksize_bits + 2]);
+                sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+                namep = kstrdup(name, GFP_KERNEL);
+                if (!namep) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                /* Need to free the kmem_cache_name() when we
+                 * destroy the slab */
+                cachep = kmem_cache_create(namep, len, 0,
+                                             SLAB_RECLAIM_ACCOUNT, NULL);
+                if (!cachep) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ext4_groupinfo_caches[cache_index] = cachep;
        }
        /* order 0 is regular bitmap */
@@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0) {
-                kfree(sbi->s_mb_offsets);
+                goto out;
-                kfree(sbi->s_mb_maxs);
-                return ret;
        }
        spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
-                kfree(sbi->s_mb_offsets);
+                ret = -ENOMEM;
-                kfree(sbi->s_mb_maxs);
+                goto out;
-                return -ENOMEM;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
@@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-        return 0;
+out:
+        if (ret) {
+                kfree(sbi->s_mb_offsets);
+                kfree(sbi->s_mb_maxs);
+                kfree(namep);
+        }
+        return ret;
 }
 /* need to called with the ext4 group lock held */
@@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
                        ext4_lock_group(sb, i);
                        ext4_mb_cleanup_pa(grinfo);
                        ext4_unlock_group(sb, i);
-                        kfree(grinfo);
+                        kmem_cache_free(cachep, grinfo);
                }
                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,7 +2605,7 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
-static inline void ext4_issue_discard(struct super_block *sb,
+static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
        int ret;
@@ -2567,10 +2615,11 @@ static inline void ext4_issue_discard(struct super_block *sb,
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == EOPNOTSUPP) {
+        if (ret == -EOPNOTSUPP) {
                ext4_warning(sb, "discard not supported, disabling");
                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
        }
+        return ret;
 }
 /*
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
 #endif
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-        ext4_pspace_cachep =
+        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
-                kmem_cache_create("ext4_prealloc_space",
+                                        SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_prealloc_space),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_pspace_cachep == NULL)
                return -ENOMEM;
-        ext4_ac_cachep =
+        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
-                kmem_cache_create("ext4_alloc_context",
+                                    SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_allocation_context),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_ac_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
-        ext4_free_ext_cachep =
+        ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
-                kmem_cache_create("ext4_free_block_extents",
+                                          SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_free_data),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_free_ext_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
        return 0;
 }
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
+        int i;
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
@@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void)
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
+        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+                struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+                if (cachep) {
+                        char *name = (char *)kmem_cache_name(cachep);
+                        kmem_cache_destroy(cachep);
+                        kfree(name);
+                }
+        }
        ext4_remove_debugfs_entry();
 }
@@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
 */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-                        struct ext4_prealloc_space *pa,
+                        struct ext4_prealloc_space *pa)
-                        struct ext4_allocation_context *ac)
 {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = pa->pa_inode;
-        }
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
@@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;
-                if (ac) {
+                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
-                        ac->ac_b_ex.fe_group = group;
+                trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
-                        ac->ac_b_ex.fe_start = bit;
+                                               grp_blk_start + bit, next - bit);
-                        ac->ac_b_ex.fe_len = next - bit;
-                        ac->ac_b_ex.fe_logical = 0;
-                        trace_ext4_mballoc_discard(ac);
-                }
-                trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-                                struct ext4_prealloc_space *pa,
+                                struct ext4_prealloc_space *pa)
-                                struct ext4_allocation_context *ac)
 {
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t group;
        ext4_grpblk_t bit;
-        trace_ext4_mb_release_group_pa(sb, ac, pa);
+        trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = NULL;
-                ac->ac_b_ex.fe_group = group;
-                ac->ac_b_ex.fe_start = bit;
-                ac->ac_b_ex.fe_len = pa->pa_len;
-                ac->ac_b_ex.fe_logical = 0;
-                trace_ext4_mballoc_discard(ac);
-        }
        return 0;
 }
@@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;
@@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
        INIT_LIST_HEAD(&list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac)
-                ac->ac_sb = sb;
 repeat:
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3756,9 @@ repeat:
                spin_unlock(pa->pa_obj_lock);
                if (pa->pa_type == MB_GROUP_PA)
-                        ext4_mb_release_group_pa(&e4b, pa, ac);
+                        ext4_mb_release_group_pa(&e4b, pa);
                else
-                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3766,6 @@ repeat:
 out:
        ext4_unlock_group(sb, group);
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
        return free;
@@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        ext4_group_t group = 0;
        struct list_head list;
        struct ext4_buddy e4b;
@@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
        INIT_LIST_HEAD(&list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = inode;
-        }
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3870,7 @@ repeat:
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
@@ -3861,8 +3879,6 @@ repeat:
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
 }
 /*
@@ -4060,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
        struct ext4_buddy e4b;
        struct list_head discard_list;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        mb_debug(1, "discard locality group preallocation\n");
        INIT_LIST_HEAD(&discard_list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac)
-                ac->ac_sb = sb;
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-                ext4_mb_release_group_pa(&e4b, pa, ac);
+                ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
 }
 /*
@@ -4491,7 +4501,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
-        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        unsigned long freed = 0;
        unsigned int overflow;
@@ -4531,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        if (!bh)
                                tbh = sb_find_get_block(inode->i_sb,
                                                        block + i);
+                        if (unlikely(!tbh))
+                                continue;
                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                    inode, tbh, block + i);
                }
@@ -4546,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        if (!ext4_should_writeback_data(inode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac) {
-                ac->ac_inode = inode;
-                ac->ac_sb = sb;
-        }
 do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4614,7 @@ do_more:
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-        if (ac) {
+        trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
-                ac->ac_b_ex.fe_group = block_group;
-                ac->ac_b_ex.fe_start = bit;
-                ac->ac_b_ex.fe_len = count;
-                trace_ext4_mballoc_free(ac);
-        }
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
@@ -4640,12 +4640,12 @@ do_more:
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
+                if (test_opt(sb, DISCARD))
+                        ext4_issue_discard(sb, block_group, bit, count);
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-                if (test_opt(sb, DISCARD))
-                        ext4_issue_discard(sb, block_group, bit, count);
        }
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4685,190 @@ error_return:
                dquot_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
        return;
 }
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:         super block for the file system
+ * @start:      starting block of the free extent in the alloc. group
+ * @count:      number of blocks to TRIM
+ * @group:      alloc. group we are working with
+ * @e4b:        ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To ensure that no
+ * one allocates those blocks in the meantime, they are marked as used in the
+ * buddy bitmap first. This must be called with the group lock held; the lock
+ * is dropped while the discard is issued and re-taken before the blocks are
+ * marked free again, so it is held again on return.
+ *
+ * Returns the value returned by ext4_issue_discard(). A non-zero value is
+ * also reported through ext4_std_error(); the blocks are given back to the
+ * buddy bitmap in either case.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+                ext4_group_t group, struct ext4_buddy *e4b)
+{
+        struct ext4_free_extent ex;
+        int ret = 0;
+        assert_spin_locked(ext4_group_lock_ptr(sb, group));
+        ex.fe_start = start;
+        ex.fe_group = group;
+        ex.fe_len = count;
+        /*
+         * Mark blocks used, so no one can reuse them while
+         * being trimmed. The group lock is released right after
+         * this, for the duration of the discard.
+         */
+        mb_mark_used(e4b, &ex);
+        ext4_unlock_group(sb, group);
+        ret = ext4_issue_discard(sb, group, start, count);
+        if (ret)
+                ext4_std_error(sb, ret);
+        ext4_lock_group(sb, group);
+        mb_free_blocks(NULL, e4b, start, ex.fe_len);    /* undo mb_mark_used() */
+        return ret;
+}
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:                 super block for file system
+ * @e4b:                ext4 buddy
+ * @start:              first group block to examine
+ * @max:                end of the range to examine; the scan stops before
+ *                      this block
+ * @minblocks:          minimum extent block count
+ *
+ * ext4_trim_all_free walks through the bitmap in e4b->bd_bitmap searching
+ * for free extents (runs of zero bits). The scan starts no earlier than the
+ * group's bb_first_free. Each free extent that is at least @minblocks long
+ * is handed to ext4_trim_extent, which marks it as used in the buddy bitmap,
+ * issues a TRIM command on it and then frees it in the buddy bitmap again.
+ *
+ * The group lock is taken here and released before returning; it is also
+ * dropped temporarily when a reschedule is needed.
+ *
+ * The scan ends when the range is exhausted, when ext4_trim_extent fails,
+ * when a fatal signal is pending, or when the group's free block count minus
+ * the blocks already trimmed drops below @minblocks.
+ *
+ * Returns the number of blocks trimmed, the negative error returned by
+ * ext4_trim_extent, or -ERESTARTSYS if a fatal signal was pending.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+                ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+        void *bitmap;
+        ext4_grpblk_t next, count = 0;
+        ext4_group_t group;
+        int ret = 0;
+        BUG_ON(e4b == NULL);
+        bitmap = e4b->bd_bitmap;
+        group = e4b->bd_group;
+        start = (e4b->bd_info->bb_first_free > start) ?
+                e4b->bd_info->bb_first_free : start;
+        ext4_lock_group(sb, group);
+        while (start < max) {
+                start = mb_find_next_zero_bit(bitmap, max, start);
+                if (start >= max)
+                        break;
+                next = mb_find_next_bit(bitmap, max, start);
+                if ((next - start) >= minblocks) {      /* [start, next) is free */
+                        ret = ext4_trim_extent(sb, start,
+                                next - start, group, e4b);
+                        if (ret < 0)
+                                break;
+                        count += next - start;
+                }
+                start = next + 1;       /* bit "next" is in use, skip it */
+                if (fatal_signal_pending(current)) {
+                        count = -ERESTARTSYS;
+                        break;
+                }
+                if (need_resched()) {
+                        ext4_unlock_group(sb, group);
+                        cond_resched();
+                        ext4_lock_group(sb, group);
+                }
+                if ((e4b->bd_info->bb_free - count) < minblocks)
+                        break;
+        }
+        ext4_unlock_group(sb, group);
+        ext4_debug("trimmed %d blocks in the group %d\n",
+                count, group);
+        if (ret < 0)
+                count = ret;    /* a trim error takes precedence over the count */
+        return count;
+}
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:                 superblock for filesystem
+ * @range:              fstrim_range structure
+ *
+ * The fields of @range are given in bytes and converted to filesystem
+ * blocks here:
+ * start:       First Byte to trim
+ * len:         number of Bytes to trim from start
+ * minlen:      minimum extent length in Bytes
+ *
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such group that has at least minlen free
+ * blocks, ext4_trim_all_free is invoked to trim all free space.
+ *
+ * On return range->len holds the number of bytes actually trimmed.
+ *
+ * Returns 0 on success, -EINVAL if minlen exceeds the size of a group or the
+ * requested range starts beyond the last group, or the negative error
+ * reported while loading a group's buddy data or trimming it.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+        struct ext4_buddy e4b;
+        ext4_group_t first_group, last_group;
+        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+        ext4_grpblk_t cnt = 0, first_block, last_block;
+        uint64_t start, len, minlen, trimmed;
+        int ret = 0;
+        start = range->start >> sb->s_blocksize_bits;
+        len = range->len >> sb->s_blocksize_bits;
+        minlen = range->minlen >> sb->s_blocksize_bits;
+        trimmed = 0;
+        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+                return -EINVAL;
+        /* Determine first and last group to examine based on start and len */
+        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+                                     &first_group, &first_block);
+        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+                                     &last_group, &last_block);
+        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+        last_block = EXT4_BLOCKS_PER_GROUP(sb);
+        if (first_group > last_group)
+                return -EINVAL;
+        for (group = first_group; group <= last_group; group++) {
+                ret = ext4_mb_load_buddy(sb, group, &e4b);
+                if (ret) {
+                        ext4_error(sb, "Error in loading buddy "
+                                        "information for %u", group);
+                        break;
+                }
+                if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+                        len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+                else
+                        last_block = len;
+                if (e4b.bd_info->bb_free >= minlen) {
+                        cnt = ext4_trim_all_free(sb, &e4b, first_block,
+                                                last_block, minlen);
+                        if (cnt < 0) {
+                                ret = cnt;
+                                ext4_mb_unload_buddy(&e4b);
+                                break;
+                        }
+                        /*
+                         * Account only for groups that were really trimmed;
+                         * a skipped group must not re-add the previous
+                         * group's count.
+                         */
+                        trimmed += cnt;
+                }
+                ext4_mb_unload_buddy(&e4b);
+                first_block = 0;        /* later groups are scanned from their start */
+        }
+        range->len = trimmed * sb->s_blocksize;
+        return ret;
+}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..25f3a974b725 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
        struct buffer_head *bh;
        struct ext4_extent_header *eh;
-        block = idx_pblock(ix);
+        block = ext4_idx_pblock(ix);
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
-                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
                return 0;
        }
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                        /* index block */
                        path[ppos].p_idx++;
-                        path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                        path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                        if (path[ppos+1].p_bh)
                                brelse(path[ppos+1].p_bh);
                        path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                path[cur_ppos].p_idx =
                                        EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
                                path[cur_ppos].p_block =
-                                        idx_pblock(path[cur_ppos].p_idx);
+                                        ext4_idx_pblock(path[cur_ppos].p_idx);
                                if (path[cur_ppos+1].p_bh)
                                        brelse(path[cur_ppos+1].p_bh);
                                path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
                        path[leaf_ppos].p_block =
-                                        ext_pblock(path[leaf_ppos].p_ext);
+                                        ext4_ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                         */
                        o_end->ee_block = end_ext->ee_block;
                        o_end->ee_len = end_ext->ee_len;
-                        ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                        ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
                }
                o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                 */
                o_end->ee_block = end_ext->ee_block;
                o_end->ee_len = end_ext->ee_len;
-                ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
                /*
                 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
        /* Insert new entry */
        if (new_ext->ee_len) {
                o_start[i] = *new_ext;
-                ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+                ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
        }
        /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        start_ext.ee_len = end_ext.ee_len = 0;
        new_ext.ee_block = cpu_to_le32(*from);
-        ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+        ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
        new_ext.ee_len = dext->ee_len;
        new_ext_alen = ext4_ext_get_actual_len(&new_ext);
        new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
                copy_extent_status(oext, &end_ext);
                end_ext_alen = ext4_ext_get_actual_len(&end_ext);
                ext4_ext_store_pblock(&end_ext,
-                        (ext_pblock(o_end) + oext_alen - end_ext_alen));
+                        (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
                end_ext.ee_block =
                        cpu_to_le32(le32_to_cpu(o_end->ee_block) +
                        oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
        /* When tmp_dext is too large, pick up the target range. */
        diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-        ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+        ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
        tmp_dext->ee_block =
                        cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
        tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
                tmp_dext->ee_len = cpu_to_le16(max_count);
        orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-        ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+        ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
        /* Adjust extent length if donor extent is larger than orig */
        if (ext4_ext_get_actual_len(tmp_dext) >
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..92203b8a099f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block, b;
+        const u8 *name = d_name->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        namelen = d_name->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;
+        if ((namelen <= 2) && (name[0] == '.') &&
+            (name[1] == '.' || name[1] == '\0')) {
+                /*
+                 * "." or ".." will only be in the first block
+                 * NFS may look up ".."; "." should be handled by the VFS
+                 * (name[1] is compared against NUL, not the digit '0')
+                 */
+                block = start = 0;
+                nblocks = 1;
+                goto restart;
+        }
        if (is_dx(dir)) {
                bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
                /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
                       struct ext4_dir_entry_2 **res_dir, int *err)
 {
-        struct super_block * sb;
+        struct super_block * sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-        u32 hash;
        struct dx_frame frames[2], *frame;
-        struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;
-        int namelen = d_name->len;
-        const u8 *name = d_name->name;
-        sb = dir->i_sb;
+        if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-        /* NFS may look up ".." - look at dx_root directory block */
+                return NULL;
-        if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-                if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-                        return NULL;
-        } else {
-                frame = frames;
-                frame->bh = NULL;                       /* for dx_release() */
-                frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-                dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-        }
-        hash = hinfo.hash;
        do {
                block = dx_get_block(frame->at);
-                if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+                if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                        goto errout;
-                de = (struct ext4_dir_entry_2 *) bh->b_data;
-                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                       EXT4_DIR_REC_LEN(0));
-                for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                                  + ((char *) de - bh->b_data);
-                        if (!ext4_check_dir_entry(dir, de, bh, off)) {
-                                brelse(bh);
-                                *err = ERR_BAD_DX_DIR;
-                                goto errout;
-                        }
-                        if (ext4_match(namelen, name, de)) {
+                retval = search_dirblock(bh, dir, d_name,
-                                *res_dir = de;
+                                         block << EXT4_BLOCK_SIZE_BITS(sb),
-                                dx_release(frames);
+                                         res_dir);
-                                return bh;
+                if (retval == 1) {      /* Success! */
-                        }
+                        dx_release(frames);
+                        return bh;
                }
                brelse(bh);
+                if (retval == -1) {
+                        *err = ERR_BAD_DX_DIR;
+                        goto errout;
+                }
                /* Check to see if we should continue to search */
-                retval = ext4_htree_next_block(dir, hash, frame,
+                retval = ext4_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning(sb,
@@ -2312,7 +2303,7 @@ retry:
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext4_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..46a7d6a9d976
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,430 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+int __init ext4_init_pageio(void)
+{
+        /*
+         * Create the two slab caches used by the page I/O code.
+         * Returns 0 on success or -ENOMEM if either cache cannot be
+         * created; nothing is left allocated on failure.
+         */
+        io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+        if (io_page_cachep == NULL)
+                return -ENOMEM;
+        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+        if (io_end_cachep == NULL) {
+                /* undo the first allocation before bailing out */
+                kmem_cache_destroy(io_page_cachep);
+                return -ENOMEM;
+        }
+        return 0;
+}
+void ext4_exit_pageio(void)     /* destroys the caches made by ext4_init_pageio() */
+{
+        kmem_cache_destroy(io_end_cachep);
+        kmem_cache_destroy(io_page_cachep);
+}
+void ext4_free_io_end(ext4_io_end_t *io)        /* release an io_end and all it holds */
+{
+        int i;
+        BUG_ON(!io);
+        if (io->page)
+                put_page(io->page);
+        for (i = 0; i < io->num_io_pages; i++) {
+                if (--io->pages[i]->p_count == 0) {     /* count dropped to zero */
+                        struct page *page = io->pages[i]->p_page;
+                        end_page_writeback(page);
+                        put_page(page);
+                        kmem_cache_free(io_page_cachep, io->pages[i]);
+                }
+        }
+        io->num_io_pages = 0;
+        iput(io->inode);        /* pairs with igrab() in ext4_init_io_end() */
+        kmem_cache_free(io_end_cachep, io);
+}
+/*
+ * check a range of space and convert unwritten extents to written.
+ *
+ * Nothing is done (and 0 is returned) if the io_end is not on a list or
+ * does not carry EXT4_IO_END_UNWRITTEN. Otherwise the range described by
+ * io->offset / io->size is passed to ext4_convert_unwritten_extents().
+ * If that fails, the error is logged and returned with the flag still
+ * set. On success a pending iocb, if any, is completed and the flag is
+ * cleared.
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+        struct inode *inode = io->inode;
+        loff_t offset = io->offset;
+        ssize_t size = io->size;
+        int ret = 0;
+        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+                   "list->prev 0x%p\n",
+                   io, inode->i_ino, io->list.next, io->list.prev);
+        if (list_empty(&io->list))
+                return ret;
+        if (!(io->flag & EXT4_IO_END_UNWRITTEN))
+                return ret;
+        ret = ext4_convert_unwritten_extents(inode, offset, size);
+        if (ret < 0) {
+                printk(KERN_EMERG "%s: failed to convert unwritten "
+                        "extents to written extents, error is %d "
+                        "io is still on inode %lu aio dio list\n",
+                       __func__, ret, inode->i_ino);
+                return ret;
+        }
+        if (io->iocb)
+                aio_complete(io->iocb, io->result, 0);
+        /* clear the DIO AIO unwritten flag */
+        io->flag &= ~EXT4_IO_END_UNWRITTEN;
+        return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ *
+ * Runs ext4_end_io_nolock() under the inode's i_mutex. If the conversion
+ * fails, the io_end is neither unlinked nor freed here. On success it is
+ * taken off its list under i_completed_io_lock and then freed.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+        ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
+        struct inode            *inode = io->inode;
+        struct ext4_inode_info  *ei = EXT4_I(inode);
+        unsigned long           flags;
+        int                     ret;
+        mutex_lock(&inode->i_mutex);
+        ret = ext4_end_io_nolock(io);
+        if (ret < 0) {
+                mutex_unlock(&inode->i_mutex);
+                return;
+        }
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        if (!list_empty(&io->list))
+                list_del_init(&io->list);
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        mutex_unlock(&inode->i_mutex);
+        ext4_free_io_end(io);
+}
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)       /* NULL on allocation failure */
+{
+        ext4_io_end_t *io = NULL;
+        io = kmem_cache_alloc(io_end_cachep, flags);
+        if (io) {
+                memset(io, 0, sizeof(*io));     /* start from an all-zero io_end */
+                io->inode = igrab(inode);       /* reference dropped by iput() in ext4_free_io_end() */
+                BUG_ON(!io->inode);
+                INIT_WORK(&io->work, ext4_end_io_work);
+                INIT_LIST_HEAD(&io->list);
+        }
+        return io;
+}
+/*
+ * Print a buffer I/O error in the same format as fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+        char b[BDEVNAME_SIZE];
+        printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+                        bdevname(bh->b_bdev, b),
+                        (unsigned long long)bh->b_blocknr);
+}
+static void ext4_end_bio(struct bio *bio, int error)
+{
+        /*
+         * bio completion handler for bios built by io_submit_init().
+         * Updates buffer and page state for every page attached to the
+         * io_end, then queues the io_end on the inode's completed-io list
+         * and schedules its work item. If the superblock is no longer
+         * active the io_end is simply freed.
+         */
+        ext4_io_end_t *io_end = bio->bi_private;
+        struct workqueue_struct *wq;
+        struct inode *inode;
+        unsigned long flags;
+        ext4_fsblk_t err_block;
+        int i;
+        BUG_ON(!io_end);
+        inode = io_end->inode;
+        bio->bi_private = NULL;
+        bio->bi_end_io = NULL;
+        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+                error = 0;
+        /* read the sector before the bio reference is dropped */
+        err_block = bio->bi_sector >> (inode->i_blkbits - 9);
+        bio_put(bio);
+        if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
+                pr_err("sb umounted, discard end_io request for inode %lu\n",
+                        io_end->inode->i_ino);
+                ext4_free_io_end(io_end);
+                return;
+        }
+        if (error) {
+                io_end->flag |= EXT4_IO_END_ERROR;
+                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                             "(offset %llu size %ld starting block %llu)",
+                             inode->i_ino,
+                             (unsigned long long) io_end->offset,
+                             (long) io_end->size,
+                             (unsigned long long) err_block);
+        }
+        for (i = 0; i < io_end->num_io_pages; i++) {
+                struct page *page = io_end->pages[i]->p_page;
+                struct buffer_head *bh, *head;
+                int partial_write = 0;
+                head = page_buffers(page);
+                if (error)
+                        SetPageError(page);
+                BUG_ON(!head);
+                if (head->b_size == PAGE_CACHE_SIZE)
+                        clear_buffer_dirty(head);
+                else {
+                        loff_t offset;
+                        loff_t io_end_offset = io_end->offset + io_end->size;
+                        offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+                        bh = head;
+                        do {
+                                if ((offset >= io_end->offset) &&
+                                    (offset+bh->b_size <= io_end_offset)) {
+                                        if (error)
+                                                buffer_io_error(bh);
+                                        clear_buffer_dirty(bh);
+                                }
+                                if (buffer_delay(bh))
+                                        partial_write = 1;
+                                else if (!buffer_mapped(bh))
+                                        clear_buffer_dirty(bh);
+                                else if (buffer_dirty(bh))
+                                        partial_write = 1;
+                                offset += bh->b_size;
+                                bh = bh->b_this_page;
+                        } while (bh != head);
+                }
+                /*
+                 * If this is a partial write which happened to make
+                 * all buffers uptodate then we can optimize away a
+                 * bogus readpage() for the next read(). Here we
+                 * 'discover' whether the page went uptodate as a
+                 * result of this (potentially partial) write.
+                 */
+                if (!partial_write)
+                        SetPageUptodate(page);
+                /*
+                 * Release the page last: once p_count reaches zero the
+                 * page reference is put and the ext4_io_page is freed,
+                 * so neither may be touched after this point.
+                 */
+                if (--io_end->pages[i]->p_count == 0) {
+                        end_page_writeback(page);
+                        put_page(page);
+                        kmem_cache_free(io_page_cachep, io_end->pages[i]);
+                }
+        }
+        io_end->num_io_pages = 0;
+        /* Add the io_end to per-inode completed io list*/
+        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+        list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+        wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
+}
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+        /*
+         * Submit the bio accumulated in @io, if there is one, and reset
+         * @io so that a new bio can be started.
+         */
+        struct bio *bio = io->io_bio;
+        if (bio) {
+                /* hold an extra reference across submit_bio() for the check below */
+                bio_get(bio);
+                submit_bio(io->io_op, bio);
+                BUG_ON(bio_flagged(bio, BIO_EOPNOTSUPP));
+                bio_put(bio);
+        }
+        io->io_bio = NULL;
+        io->io_op = 0;
+        io->io_end = NULL;
+}
+static int io_submit_init(struct ext4_io_submit *io,
+                          struct inode *inode,
+                          struct writeback_control *wbc,
+                          struct buffer_head *bh)
+{
+        ext4_io_end_t *io_end;
+        struct page *page = bh->b_page;
+        int nvecs = bio_get_nr_vecs(bh->b_bdev);
+        struct bio *bio;
+        io_end = ext4_init_io_end(inode, GFP_NOFS);
+        if (!io_end)
+                return -ENOMEM; /* only failure this function reports */
+        do {
+                bio = bio_alloc(GFP_NOIO, nvecs);
+                nvecs >>= 1;    /* on failure retry with half as many vecs */
+        } while (bio == NULL);
+        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+        bio->bi_bdev = bh->b_bdev;
+        bio->bi_private = io->io_end = io_end;
+        bio->bi_end_io = ext4_end_bio;
+        io_end->inode = inode;
+        io_end->offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) +
+                         bh_offset(bh); /* widen first: index is unsigned long */
+        io->io_bio = bio;
+        io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
+        io->io_next_block = bh->b_blocknr;
+        return 0;
+}
+static int io_submit_add_bh(struct ext4_io_submit *io,
+                            struct ext4_io_page *io_page,
+                            struct inode *inode,
+                            struct writeback_control *wbc,
+                            struct buffer_head *bh)
+{
+        ext4_io_end_t *io_end;
+        int ret;
+        if (buffer_new(bh)) {
+                clear_buffer_new(bh);
+                unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+        }
+        if (!buffer_mapped(bh) || buffer_delay(bh)) {
+                if (!buffer_mapped(bh))
+                        clear_buffer_dirty(bh);
+                if (io->io_bio)
+                        ext4_io_submit(io);     /* flush what was gathered so far */
+                return 0;       /* this buffer itself is not written */
+        }
+        if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:               /* also entered by goto from below */
+                ext4_io_submit(io);
+        }
+        if (io->io_bio == NULL) {
+                ret = io_submit_init(io, inode, wbc, bh);
+                if (ret)
+                        return ret;
+        }
+        io_end = io->io_end;
+        if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+            (io_end->pages[io_end->num_io_pages-1] != io_page))
+                goto submit_and_retry;
+        ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+        if (ret != bh->b_size)
+                goto submit_and_retry;  /* io_end not yet charged for this bh */
+        if (buffer_uninit(bh))
+                io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
+        io->io_end->size += bh->b_size; /* account only after the add succeeded */
+        io->io_next_block++;
+        if ((io_end->num_io_pages == 0) ||
+            (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+                io_end->pages[io_end->num_io_pages++] = io_page;
+                io_page->p_count++;     /* one count per io_end holding this page */
+        }
+        return 0;
+}
+int ext4_bio_write_page(struct ext4_io_submit *io,
+                        struct page *page,
+                        int len,
+                        struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        unsigned block_start, block_end, blocksize;
+        struct ext4_io_page *io_page;
+        struct buffer_head *bh, *head;
+        int ret = 0;
+        blocksize = 1 << inode->i_blkbits;
+        BUG_ON(PageWriteback(page));
+        io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+        if (!io_page) {         /* writeback bit not set yet: nothing to undo */
+                set_page_dirty(page);
+                unlock_page(page);
+                return -ENOMEM;
+        }
+        io_page->p_page = page;
+        io_page->p_count = 0;
+        get_page(page);
+        set_page_writeback(page);       /* only once the allocation succeeded */
+        ClearPageError(page);
+        for (bh = head = page_buffers(page), block_start = 0;
+             bh != head || !block_start;
+             block_start = block_end, bh = bh->b_this_page) {
+                block_end = block_start + blocksize;
+                if (block_start >= len) {       /* buffer lies beyond len: not written */
+                        clear_buffer_dirty(bh);
+                        set_buffer_uptodate(bh);
+                        continue;
+                }
+                ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+                if (ret) {
+                        /*
+                         * We only get here on ENOMEM.  Not much else
+                         * we can do but mark the page as dirty, and
+                         * better luck next time.
+                         */
+                        set_page_dirty(page);
+                        break;
+                }
+        }
+        unlock_page(page);
+        /*
+         * If the page was truncated before we could do the writeback,
+         * or we had a memory allocation error while trying to write
+         * the first buffer head, we won't have submitted any pages for
+         * I/O.  In that case we need to make sure we've cleared the
+         * PageWriteback bit from the page to prevent the system from
+         * wedging later on.
+         */
+        if (io_page->p_count == 0) {
+                put_page(page);
+                end_page_writeback(page);
+                kmem_cache_free(io_page_cachep, io_page);
+        }
+        return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..dc963929de65 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
        }
        /* Zero out all of the reserved backup group descriptor table blocks */
-        for (i = 0, bit = gdblocks + 1, block = start + bit;
+        ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
-             i < reserved_gdb; i++, block++, bit++) {
+                        block, sbi->s_itb_per_group);
-                struct buffer_head *gdb;
+        err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+                               GFP_NOFS);
-                ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
+        if (err)
+                goto exit_bh;
-                if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                        goto exit_bh;
-                if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                        err = PTR_ERR(gdb);
-                        goto exit_bh;
-                }
-                ext4_handle_dirty_metadata(handle, NULL, gdb);
-                ext4_set_bit(bit, bh->b_data);
-                brelse(gdb);
-        }
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
        ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
        ext4_set_bit(input->inode_bitmap - start, bh->b_data);
        /* Zero out all of the inode table blocks */
-        for (i = 0, block = input->inode_table, bit = block - start;
+        block = input->inode_table;
-             i < sbi->s_itb_per_group; i++, bit++, block++) {
+        ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
-                struct buffer_head *it;
+                        block, sbi->s_itb_per_group);
+        err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
-                ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
+        if (err)
+                goto exit_bh;
-                if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                        goto exit_bh;
-                if (IS_ERR(it = bclean(handle, sb, block))) {
-                        err = PTR_ERR(it);
-                        goto exit_bh;
-                }
-                ext4_handle_dirty_metadata(handle, NULL, it);
-                brelse(it);
-                ext4_set_bit(bit, bh->b_data);
-        }
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
-        mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+        ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+                             bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                goto exit_journal;
        }
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-                        bh->b_data);
+                             bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
        brelse(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8ecc1e590303..40131b777af6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -40,6 +40,9 @@
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -49,8 +52,11 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
-struct proc_dir_entry *ext4_proc_root;
+static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
+struct ext4_lazy_init *ext4_li_info;
+struct mutex ext4_li_mtx;
+struct ext4_features *ext4_feat;
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -67,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt);
+                       const char *dev_name, void *data);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext3",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -701,6 +709,7 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
+        ext4_unregister_li_request(sb);
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        flush_workqueue(sbi->dio_unwritten_wq);
@@ -717,6 +726,7 @@ static void ext4_put_super(struct super_block *sb)
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
+        del_timer(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
@@ -1042,6 +1052,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
                seq_puts(seq, ",block_validity");
+        if (!test_opt(sb, INIT_INODE_TABLE))    /* names must match the tokens table */
+                seq_puts(seq, ",noinit_itable");
+        else if (sbi->s_li_wait_mult)
+                seq_printf(seq, ",init_itable=%u",
+                           (unsigned) sbi->s_li_wait_mult);
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -1170,6 +1186,7 @@ static const struct super_operations ext4_sops = {
        .quota_write    = ext4_quota_write,
 #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
+        .trim_fs        = ext4_trim_fs
 };
 static const struct super_operations ext4_nojournal_sops = {
@@ -1216,6 +1233,7 @@ enum {
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
+        Opt_init_inode_table, Opt_noinit_inode_table,
 };
 static const match_table_t tokens = {
@@ -1286,6 +1304,9 @@ static const match_table_t tokens = {
        {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
+        {Opt_init_inode_table, "init_itable=%u"},
+        {Opt_init_inode_table, "init_itable"},
+        {Opt_noinit_inode_table, "noinit_itable"},
        {Opt_err, NULL},
 };
@@ -1756,6 +1777,20 @@ set_qf_format:
                case Opt_dioread_lock:
                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
                        break;
+                case Opt_init_inode_table:
+                        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        if (args[0].from) {
+                                if (match_int(&args[0], &option))
+                                        return 0;
+                        } else
+                                option = EXT4_DEF_LI_WAIT_MULT;
+                        if (option < 0)
+                                return 0;
+                        sbi->s_li_wait_mult = option;
+                        break;
+                case Opt_noinit_inode_table:
+                        clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1939,7 +1974,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+                                  ext4_group_t *first_not_zeroed)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1948,7 +1984,7 @@ static int ext4_check_descriptors(struct super_block *sb)
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
-        ext4_group_t i;
+        ext4_group_t i, grp = sbi->s_groups_count;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                flexbg_flag = 1;
@@ -1964,6 +2000,10 @@ static int ext4_check_descriptors(struct super_block *sb)
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+                if ((grp == sbi->s_groups_count) &&
+                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        grp = i;
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2001,6 +2041,8 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
+        if (NULL != first_not_zeroed)
+                *first_not_zeroed = grp;
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2373,6 +2415,7 @@ static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)       \
@@ -2409,6 +2452,16 @@ static struct attribute *ext4_attrs[] = {
        NULL,
 };
+/* Features this copy of ext4 supports; each one is a 0444 attribute with no show/store handler */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+static struct attribute *ext4_feat_attrs[] = {
+        ATTR_LIST(lazy_itable_init),
+        ATTR_LIST(batched_discard),
+        NULL,   /* end-of-list marker, as in ext4_attrs[] */
+};
 static ssize_t ext4_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
 {
@@ -2437,7 +2490,6 @@ static void ext4_sb_release(struct kobject *kobj)
        complete(&sbi->s_kobj_unregister);
 }
 static const struct sysfs_ops ext4_attr_ops = {
        .show   = ext4_attr_show,
        .store  = ext4_attr_store,
@@ -2449,6 +2501,17 @@ static struct kobj_type ext4_ktype = {
        .release        = ext4_sb_release,
 };
+static void ext4_feat_release(struct kobject *kobj)
+{
+        complete(&ext4_feat->f_kobj_unregister);        /* kobj itself is unused; signals via the global ext4_feat */
+}
+static struct kobj_type ext4_feat_ktype = {
+        .default_attrs  = ext4_feat_attrs,      /* the feature attribute list */
+        .sysfs_ops      = &ext4_attr_ops,       /* ext4_attr_show / ext4_attr_store */
+        .release        = ext4_feat_release,    /* completes f_kobj_unregister */
+};
 /*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
@@ -2539,6 +2602,372 @@ static void print_daily_error_info(unsigned long arg)
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+        struct task_struct *p = (struct task_struct *)data;     /* li_timer.data is set to `current` by ext4_lazyinit_thread() */
+        wake_up_process(p);
+}
+/* Init the itable of the next group lacking EXT4_BG_INODE_ZEROED; nonzero return makes the caller drop elr */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+        struct ext4_group_desc *gdp = NULL;
+        ext4_group_t group, ngroups;
+        struct super_block *sb;
+        unsigned long timeout = 0;
+        int ret = 0;
+        sb = elr->lr_super;
+        ngroups = EXT4_SB(sb)->s_groups_count;
+        for (group = elr->lr_next_group; group < ngroups; group++) {
+                gdp = ext4_get_group_desc(sb, group, NULL);
+                if (!gdp) {
+                        ret = 1;        /* descriptor unavailable: give up on this request */
+                        break;
+                }
+                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        break;          /* found a group that still needs zeroing */
+        }
+        if (group == ngroups)
+                ret = 1;        /* scanned to the end: nothing left to do */
+        if (!ret) {
+                timeout = jiffies;      /* start stamp for the measurement below */
+                ret = ext4_init_inode_table(sb, group,
+                                            elr->lr_timeout ? 0 : 1);   /* last arg is 1 only while lr_timeout is still 0 */
+                if (elr->lr_timeout == 0) {
+                        timeout = jiffies - timeout;    /* jiffies spent in the call above */
+                        if (elr->lr_sbi->s_li_wait_mult)
+                                timeout *= elr->lr_sbi->s_li_wait_mult;
+                        else
+                                timeout *= 20;  /* multiplier used when s_li_wait_mult is 0 */
+                        elr->lr_timeout = timeout;
+                }
+                elr->lr_next_sched = jiffies + elr->lr_timeout;
+                elr->lr_next_group = group + 1; /* resume after this group next time */
+        }
+        return ret;
+}
+/*
+ * Remove the request from the request list and free the
+ * request structure. Should be called with li_list_mtx held
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+        struct ext4_sb_info *sbi;
+        if (!elr)
+                return;         /* NULL is accepted and ignored */
+        sbi = elr->lr_sbi;
+        list_del(&elr->lr_request);
+        sbi->s_li_request = NULL;       /* the superblock must not keep the freed pointer */
+        kfree(elr);
+}
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+        /* s_li_request is read only under li_list_mtx: the thread frees it under that lock */
+        if (!ext4_li_info)
+                return;
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_run_li_request) and keep track of the time spent in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+        struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+        struct list_head *pos, *n;
+        struct ext4_li_request *elr;
+        unsigned long next_wakeup;
+        DEFINE_WAIT(wait);
+        int ret = 0;    /* must start at 0: tested below even when no request was run */
+        BUG_ON(NULL == eli);
+        eli->li_timer.data = (unsigned long)current;
+        eli->li_timer.function = ext4_lazyinode_timeout;
+        eli->li_task = current;
+        wake_up(&eli->li_wait_task);    /* lets ext4_run_lazyinit_thread() see li_task != NULL */
+cont_thread:
+        while (true) {
+                next_wakeup = MAX_JIFFY_OFFSET;
+                mutex_lock(&eli->li_list_mtx);
+                if (list_empty(&eli->li_request_list)) {
+                        mutex_unlock(&eli->li_list_mtx);
+                        goto exit_thread;
+                }
+                list_for_each_safe(pos, n, &eli->li_request_list) {
+                        elr = list_entry(pos, struct ext4_li_request,
+                                         lr_request);
+                        if (time_after_eq(jiffies, elr->lr_next_sched))
+                                ret = ext4_run_li_request(elr);
+                        if (ret) {      /* request finished or failed: drop it */
+                                ret = 0;
+                                ext4_remove_li_request(elr);
+                                continue;
+                        }
+                        if (time_before(elr->lr_next_sched, next_wakeup))
+                                next_wakeup = elr->lr_next_sched;
+                }
+                mutex_unlock(&eli->li_list_mtx);
+                if (freezing(current))
+                        refrigerator();
+                if (time_after_eq(jiffies, next_wakeup)) {
+                        cond_resched();
+                        continue;       /* something is already due: rescan at once */
+                }
+                eli->li_timer.expires = next_wakeup;
+                add_timer(&eli->li_timer);
+                prepare_to_wait(&eli->li_wait_daemon, &wait,
+                                TASK_INTERRUPTIBLE);
+                if (time_before(jiffies, next_wakeup))
+                        schedule();
+                finish_wait(&eli->li_wait_daemon, &wait);
+        }
+exit_thread:
+        /*
+         * It looks like the request list is empty, but we need
+         * to check it under the li_list_mtx lock, to prevent any
+         * additions into it, and of course we should lock ext4_li_mtx
+         * to atomically free the list and ext4_li_info, because at
+         * this point another ext4 filesystem could be registering
+         * new one.
+         */
+        mutex_lock(&ext4_li_mtx);
+        mutex_lock(&eli->li_list_mtx);
+        if (!list_empty(&eli->li_request_list)) {
+                mutex_unlock(&eli->li_list_mtx);
+                mutex_unlock(&ext4_li_mtx);
+                goto cont_thread;
+        }
+        mutex_unlock(&eli->li_list_mtx);
+        del_timer_sync(&ext4_li_info->li_timer);
+        eli->li_task = NULL;
+        wake_up(&eli->li_wait_task);
+        kfree(ext4_li_info);
+        ext4_li_info = NULL;
+        mutex_unlock(&ext4_li_mtx);
+        return 0;
+}
+static void ext4_clear_request_list(void)
+{
+        struct list_head *pos, *n;
+        struct ext4_li_request *elr;
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        /* No early return for an empty list: the loop below is then a no-op, */
+        /* and returning here would leave li_list_mtx locked. */
+        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+                elr = list_entry(pos, struct ext4_li_request,
+                                 lr_request);
+                ext4_remove_li_request(elr);    /* unlinks and frees elr */
+        }
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+static int ext4_run_lazyinit_thread(void)
+{
+        struct task_struct *t;
+        t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+        if (IS_ERR(t)) {
+                int err = PTR_ERR(t);
+                ext4_clear_request_list();      /* frees every request already queued */
+                del_timer_sync(&ext4_li_info->li_timer);
+                kfree(ext4_li_info);
+                ext4_li_info = NULL;            /* a later registration allocates a new one */
+                printk(KERN_CRIT "EXT4: error %d creating inode table "
+                                 "initialization thread\n",
+                                 err);
+                return err;
+        }
+        ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+        wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);  /* thread sets li_task, then wakes us */
+        return 0;
+}
+/*
+ * Decide whether starting the itable init thread is worthwhile.
+ * Scans the groups in order and returns the number of the first one
+ * whose descriptor lacks EXT4_BG_INODE_ZEROED; when every readable
+ * descriptor has the flag, the total group count is returned.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+        const ext4_group_t total = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t idx;
+        for (idx = 0; idx < total; idx++) {
+                struct ext4_group_desc *desc;
+                desc = ext4_get_group_desc(sb, idx, NULL);
+                if (desc &&
+                    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        break;
+        }
+        return idx;
+}
+static int ext4_li_info_new(void)
+{
+        struct ext4_lazy_init *eli = NULL;
+        eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+        if (!eli)
+                return -ENOMEM;
+        eli->li_task = NULL;    /* stays NULL until the thread stores `current` here */
+        INIT_LIST_HEAD(&eli->li_request_list);
+        mutex_init(&eli->li_list_mtx);
+        init_waitqueue_head(&eli->li_wait_daemon);      /* the thread sleeps on this one */
+        init_waitqueue_head(&eli->li_wait_task);        /* waiters for li_task changes */
+        init_timer(&eli->li_timer);
+        eli->li_state |= EXT4_LAZYINIT_QUIT;    /* NOTE(review): QUIT set at creation, RUNNING is OR-ed in later - confirm intent */
+        ext4_li_info = eli;     /* publish; ext4_register_li_request() calls this under ext4_li_mtx */
+        return 0;
+}
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                            ext4_group_t start)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_li_request *elr;
+        unsigned long rnd;
+        elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+        if (!elr)
+                return NULL;    /* the caller maps this to -ENOMEM */
+        elr->lr_super = sb;
+        elr->lr_sbi = sbi;
+        elr->lr_next_group = start;     /* first group ext4_run_li_request() will examine */
+        /*
+         * Randomize first schedule time of the request to
+         * spread the inode table initialization requests
+         * better.
+         */
+        get_random_bytes(&rnd, sizeof(rnd));
+        elr->lr_next_sched = jiffies + (unsigned long)rnd %
+                             (EXT4_DEF_LI_MAX_START_DELAY * HZ);        /* delay in [0, EXT4_DEF_LI_MAX_START_DELAY * HZ) jiffies */
+        return elr;     /* not yet on any list; the caller links it in */
+}
+static int ext4_register_li_request(struct super_block *sb,
+                                    ext4_group_t first_not_zeroed)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_li_request *elr;
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        int ret = 0;    /* stays 0 when the thread is already running */
+        if (sbi->s_li_request != NULL)
+                return 0;       /* already registered */
+        if (first_not_zeroed == ngroups ||
+            (sb->s_flags & MS_RDONLY) ||
+            !test_opt(sb, INIT_INODE_TABLE)) {
+                sbi->s_li_request = NULL;
+                return 0;       /* nothing to zero, read-only, or disabled by option */
+        }
+        elr = ext4_li_request_new(sb, first_not_zeroed);
+        if (!elr)
+                return -ENOMEM;
+        mutex_lock(&ext4_li_mtx);
+        if (NULL == ext4_li_info) {
+                ret = ext4_li_info_new();
+                if (ret)
+                        goto out;       /* elr is still ours and is freed at "out" */
+        }
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+        sbi->s_li_request = elr;
+        /* The request now belongs to the list: if the thread cannot be
+         * started, ext4_clear_request_list() already frees it, so the
+         * kfree() at "out" must not see it a second time. */
+        elr = NULL;
+        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+                ret = ext4_run_lazyinit_thread();
+                if (ret)
+                        goto out;
+        }
+out:
+        mutex_unlock(&ext4_li_mtx);
+        if (ret)
+                kfree(elr);     /* no-op when elr was handed over to the list */
+        return ret;
+}
+/*
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+        /*
+         * If thread exited earlier
+         * there's nothing to be done.
+         */
+        if (!ext4_li_info)
+                return;
+        ext4_clear_request_list();      /* an empty list sends the thread to its exit path */
+        while (ext4_li_info->li_task) { /* NOTE(review): the thread kfree()s ext4_li_info right after clearing li_task - confirm this re-read is safe */
+                wake_up(&ext4_li_info->li_wait_daemon);
+                wait_event(ext4_li_info->li_wait_task,
+                           ext4_li_info->li_task == NULL);
+        }
+}
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2564,6 +2993,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+        ext4_group_t first_not_zeroed;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
@@ -2624,6 +3054,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sbi->s_mount_opt, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2901,7 +3332,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount2;
                }
        }
-        if (!ext4_check_descriptors(sb)) {
+        if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
@@ -3122,6 +3553,10 @@ no_journal:
                goto failed_mount4;
        }
+        err = ext4_register_li_request(sb, first_not_zeroed);
+        if (err)
+                goto failed_mount4;
        sbi->s_kobj.kset = ext4_kset;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3461,7 +3896,7 @@ static int ext4_load_journal(struct super_block *sb,
        EXT4_SB(sb)->s_journal = journal;
        ext4_clear_journal_err(sb, es);
-        if (journal_devnum &&
+        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
@@ -3514,9 +3949,12 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+                ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
-        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+        if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+                es->s_free_inodes_count =
+                        cpu_to_le32(percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
@@ -3835,6 +4273,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        enable_quota = 1;
                }
        }
+        /*
+         * Reinitialize lazy itable initialization thread based on
+         * current settings
+         */
+        if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+                ext4_unregister_li_request(sb);
+        else {
+                ext4_group_t first_not_zeroed;
+                first_not_zeroed = ext4_has_uninit_itable(sb);
+                ext4_register_li_request(sb, first_not_zeroed);
+        }
        ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
                ext4_commit_super(sb, 1);
@@ -4216,17 +4667,17 @@ out:
 #endif
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext2",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -4271,28 +4722,58 @@ static inline void unregister_as_ext3(void) { }
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
-static int __init init_ext4_fs(void)
+int __init ext4_init_feat_adverts(void)
+{
+        struct ext4_features *ef;
+        int ret = -ENOMEM;
+        ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+        if (!ef)
+                goto out;
+        ef->f_kobj.kset = ext4_kset;
+        init_completion(&ef->f_kobj_unregister);
+        ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+                                   "features");
+        if (ret) {
+                kfree(ef);
+                goto out;
+        }
+        ext4_feat = ef;
+        ret = 0;
+out:
+        return ret;
+}
+static int __init ext4_init_fs(void)
 {
        int err;
        ext4_check_flag_values();
-        err = init_ext4_system_zone();
+        err = ext4_init_pageio();
        if (err)
                return err;
+        err = ext4_init_system_zone();
+        if (err)
+                goto out5;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
                goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-        err = init_ext4_mballoc();
+        err = ext4_init_feat_adverts();
+        err = ext4_init_mballoc();
        if (err)
                goto out3;
-        err = init_ext4_xattr();
+        err = ext4_init_xattr();
        if (err)
                goto out2;
        err = init_inodecache();
@@ -4303,38 +4784,46 @@ static int __init init_ext4_fs(void)
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
+        ext4_li_info = NULL;
+        mutex_init(&ext4_li_mtx);
        return 0;
 out:
        unregister_as_ext2();
        unregister_as_ext3();
        destroy_inodecache();
 out1:
-        exit_ext4_xattr();
+        ext4_exit_xattr();
 out2:
-        exit_ext4_mballoc();
+        ext4_exit_mballoc();
 out3:
+        kfree(ext4_feat);
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
 out4:
-        exit_ext4_system_zone();
+        ext4_exit_system_zone();
+out5:
+        ext4_exit_pageio();
        return err;
 }
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)
 {
+        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
-        exit_ext4_xattr();
+        ext4_exit_xattr();
-        exit_ext4_mballoc();
+        ext4_exit_mballoc();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
-        exit_ext4_system_zone();
+        ext4_exit_system_zone();
+        ext4_exit_pageio();
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
 MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
+module_init(ext4_init_fs)
-module_exit(exit_ext4_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fa4b899da4b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 #undef BLOCK_HASH_SHIFT
 int __init
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
        ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
        if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
 }
 void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
        if (ext4_xattr_cache)
                mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
-extern int init_ext4_xattr(void);
+extern int __init ext4_init_xattr(void);
-extern void exit_ext4_xattr(void);
+extern void ext4_exit_xattr(void);
 extern const struct xattr_handler *ext4_xattr_handlers[];
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
 {
 }
-static inline int
+static __init inline int
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
        return 0;
 }
 static inline void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbca5c186ae7..3345aabd1dd7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -675,18 +675,17 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static int msdos_get_sb(struct file_system_type *fs_type,
+static struct dentry *msdos_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
-                           mnt);
 }
 static struct file_system_type msdos_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "msdos",
-        .get_sb         = msdos_get_sb,
+        .mount          = msdos_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6f0f6c9a0152..b936703b8924 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1071,18 +1071,17 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static int vfat_get_sb(struct file_system_type *fs_type,
+static struct dentry *vfat_mount(struct file_system_type *fs_type,
                       int flags, const char *dev_name,
-                       void *data, struct vfsmount *mnt)
+                       void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
-                           mnt);
 }
 static struct file_system_type vfat_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "vfat",
-        .get_sb         = vfat_get_sb,
+        .mount          = vfat_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..ecc8b3954ed6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
 * match the state "is the filp on a fasync list".
 *
 */
-static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
        struct fasync_struct *fa, **fp;
        int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
        return result;
 }
+struct fasync_struct *fasync_alloc(void)
+{
+        return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+}
 /*
- * Add a fasync entry. Return negative on error, positive if
+ * NOTE! This can be used only for unused fasync entries:
- * added, and zero if did nothing but change an existing one.
+ * entries that actually got inserted on the fasync list
+ * need to be released by rcu - see fasync_remove_entry.
+ */
+void fasync_free(struct fasync_struct *new)
+{
+        kmem_cache_free(fasync_cache, new);
+}
+/*
+ * Insert a new entry into the fasync list.  Return the pointer to the
+ * old one if we didn't use the new one.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
-static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
 {
-        struct fasync_struct *new, *fa, **fp;
+        struct fasync_struct *fa, **fp;
-        int result = 0;
-        new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
-        if (!new)
-                return -ENOMEM;
        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
                spin_lock_irq(&fa->fa_lock);
                fa->fa_fd = fd;
                spin_unlock_irq(&fa->fa_lock);
-                kmem_cache_free(fasync_cache, new);
                goto out;
        }
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
        new->fa_fd = fd;
        new->fa_next = *fapp;
        rcu_assign_pointer(*fapp, new);
-        result = 1;
        filp->f_flags |= FASYNC;
 out:
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
-        return result;
+        return fa;
+}
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+        struct fasync_struct *new;
+        new = fasync_alloc();
+        if (!new)
+                return -ENOMEM;
+        /*
+         * fasync_insert_entry() returns the old (updated) entry if
+         * it existed.
+         *
+         * So free the (unused) new entry and return 0 to let the
+         * caller know that we didn't add any new fasync entries.
+         */
+        if (fasync_insert_entry(fd, filp, fapp, new)) {
+                fasync_free(new);
+                return 0;
+        }
+        return 1;
 }
 /*
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
 * Return the total number of open files in the system
 */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
        return percpu_counter_read_positive(&nr_files);
 }
@@ -68,7 +68,7 @@ static int get_nr_files(void)
 /*
 * Return the maximum number of open files in the system
 */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
        return files_stat.max_files;
 }
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        files_stat.nr_files = get_nr_files();
-        return proc_dointvec(table, write, buffer, lenp, ppos);
+        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
        const struct cred *cred = current_cred();
-        static int old_max;
+        static long old_max;
        struct file * f;
        /*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
        /* Ran out of filps - report that */
        if (get_nr_files() > old_max) {
-                printk(KERN_INFO "VFS: file-max limit %d reached\n",
+                pr_info("VFS: file-max limit %lu reached\n", get_max_files());
-                                        get_max_files());
                old_max = get_nr_files();
        }
        goto fail;
@@ -487,7 +486,7 @@ retry:
 void __init files_init(unsigned long mempages)
 { 
-        int n; 
+        unsigned long n;
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
         */ 
        n = (mempages * (PAGE_SIZE / 1024)) / 10;
-        files_stat.max_files = n; 
+        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-        if (files_stat.max_files < NR_FILE)
-                files_stat.max_files = NR_FILE;
        files_defer_init();
        lg_lock_init(files_lglock);
        percpu_counter_init(&nr_files, 0);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
        struct inode                    *ip = NULL;
        if ((ip = new_inode(sbp))) {
+                ip->i_ino = get_next_ino();
                vxfs_iinit(ip, vip);
                ip->i_mapping->a_ops = &vxfs_aops;
        }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 71b0148b8784..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -246,17 +246,16 @@ out:
 /*
 * The usual module blurb.
 */
-static int vxfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *vxfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
-                           mnt);
 }
 static struct file_system_type vxfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "vxfs",
-        .get_sb         = vxfs_get_sb,
+        .mount          = vxfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..aed881a76b22 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
        return sb->s_bdi;
 }
+static inline struct inode *wb_inode(struct list_head *head)
+{
+        return list_entry(head, struct inode, i_wb_list);
+}
 static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
 {
@@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode)
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
-                tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
-        list_move(&inode->i_list, &wb->b_dirty);
+        list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 /*
@@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-        list_move(&inode->i_list, &wb->b_more_io);
+        list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        int do_sb_sort = 0;
        while (!list_empty(delaying_queue)) {
-                inode = list_entry(delaying_queue->prev, struct inode, i_list);
+                inode = wb_inode(delaying_queue->prev);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
-                list_move(&inode->i_list, &tmp);
+                list_move(&inode->i_wb_list, &tmp);
        }
        /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
-                inode = list_entry(tmp.prev, struct inode, i_list);
+                sb = wb_inode(tmp.prev)->i_sb;
-                sb = inode->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
-                        inode = list_entry(pos, struct inode, i_list);
+                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
-                                list_move(&inode->i_list, dispatch_queue);
+                                list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
 }
@@ -408,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                         * completion.
                         */
                        redirty_tail(inode);
-                } else if (atomic_read(&inode->i_count)) {
-                        /*
-                         * The inode is clean, inuse
-                         */
-                        list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
-                         * The inode is clean, unused
+                         * The inode is clean.  At this point we either have
+                         * a reference to the inode or it's on its way out.
+                         * No need to add it back to the LRU.
                         */
-                        list_move(&inode->i_list, &inode_unused);
+                        list_del_init(&inode->i_wb_list);
                }
        }
        inode_sync_complete(inode);
@@ -465,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                if (inode->i_sb != sb) {
                        if (only_this_sb) {
@@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                        return 0;
                }
-                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+                /*
+                 * Don't bother with new inodes or inodes being freed, first
+                 * kind does not need periodic writeout yet, and for the latter
+                 * kind writeout is handled by the freer.
+                 */
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                if (inode_dirtied_after(inode, wbc->wb_start))
                        return 1;
-                BUG_ON(inode->i_state & I_FREEING);
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
@@ -536,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
                queue_io(wb, wbc->older_than_this);
        while (!list_empty(&wb->b_io)) {
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                struct super_block *sb = inode->i_sb;
                if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +586,7 @@ static inline bool over_bground_thresh(void)
        global_dirty_limits(&background_thresh, &dirty_thresh);
        return (global_page_state(NR_FILE_DIRTY) +
-                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+                global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 /*
@@ -675,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 */
                spin_lock(&inode_lock);
                if (!list_empty(&wb->b_more_io))  {
-                        inode = list_entry(wb->b_more_io.prev,
+                        inode = wb_inode(wb->b_more_io.prev);
-                                                struct inode, i_list);
                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
@@ -721,9 +724,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
                return 0;
        wb->last_old_flush = jiffies;
+        /*
+         * Add in the number of potentially dirty inodes, because each inode
+         * write can dirty pagecache in the underlying blockdev.
+         */
        nr_pages = global_page_state(NR_FILE_DIRTY) +
                        global_page_state(NR_UNSTABLE_NFS) +
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+                        get_nr_dirty_inodes();
        if (nr_pages) {
                struct wb_writeback_work work = {
@@ -790,7 +797,7 @@ int bdi_writeback_thread(void *data)
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
-        current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+        current->flags |= PF_SWAPWRITE;
        set_freezable();
        wb->last_active = jiffies;
@@ -962,7 +969,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
-                        if (hlist_unhashed(&inode->i_hash))
+                        if (inode_unhashed(inode))
                                goto out;
                }
                if (inode->i_state & I_FREEING)
@@ -990,7 +997,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        }
                        inode->dirtied_when = jiffies;
-                        list_move(&inode->i_list, &bdi->wb.b_dirty);
+                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                }
        }
 out:
@@ -1090,8 +1097,7 @@ void writeback_inodes_sb(struct super_block *sb)
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        work.nr_pages = nr_dirty + nr_unstable +
+        work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        bdi_queue_work(sb->s_bdi, &work);
        wait_for_completion(&done);
@@ -1198,3 +1204,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
        return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+        struct writeback_control wbc = {
+                .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+                .nr_to_write = 0, /* metadata-only */
+        };
+        return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7367e177186f..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -222,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = fc->user_id;
        inode->i_gid = fc->group_id;
@@ -321,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
-                        const char *dev_name, void *raw_data,
+                        int flags, const char *dev_name, void *raw_data)
-                        struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, raw_data,
+        return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
-                                fuse_ctl_fill_super, mnt);
 }
 static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -345,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
 static struct file_system_type fuse_ctl_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fusectl",
-        .get_sb         = fuse_ctl_get_sb,
+        .mount          = fuse_ctl_mount,
        .kill_sb        = fuse_ctl_kill_sb,
 };
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..6e07696308dc 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
        int err;
        struct page *page = *pagep;
-        if (page && zeroing && count < PAGE_SIZE) {
+        if (page && zeroing && count < PAGE_SIZE)
-                void *mapaddr = kmap_atomic(page, KM_USER1);
+                clear_highpage(page);
-                memset(mapaddr, 0, PAGE_SIZE);
-                kunmap_atomic(mapaddr, KM_USER1);
-        }
        while (count) {
                if (cs->write && cs->pipebufs && page) {
                        return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                        }
                }
                if (page) {
-                        void *mapaddr = kmap_atomic(page, KM_USER1);
+                        void *mapaddr = kmap_atomic(page, KM_USER0);
                        void *buf = mapaddr + offset;
                        offset += fuse_copy_do(cs, &buf, &count);
-                        kunmap_atomic(mapaddr, KM_USER1);
+                        kunmap_atomic(mapaddr, KM_USER0);
                } else
                        offset += fuse_copy_do(cs, NULL, &count);
        }
@@ -1336,12 +1334,7 @@ out_finish:
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-        int i;
+        release_pages(req->pages, req->num_pages, 0);
-        for (i = 0; i < req->num_pages; i++) {
-                struct page *page = req->pages[i];
-                page_cache_release(page);
-        }
 }
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..cfce3ad86a92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        return err;
 }
-static int fuse_get_sb(struct file_system_type *fs_type,
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
                       int flags, const char *dev_name,
-                       void *raw_data, struct vfsmount *mnt)
+                       void *raw_data)
 {
-        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+        return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
 }
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,17 +1065,16 @@ static struct file_system_type fuse_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuse",
        .fs_flags       = FS_HAS_SUBTYPE,
-        .get_sb         = fuse_get_sb,
+        .mount          = fuse_mount,
        .kill_sb        = fuse_kill_sb_anon,
 };
 #ifdef CONFIG_BLOCK
-static int fuse_get_sb_blk(struct file_system_type *fs_type,
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
                           int flags, const char *dev_name,
-                           void *raw_data, struct vfsmount *mnt)
+                           void *raw_data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
-                           mnt);
 }
 static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
 static struct file_system_type fuseblk_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuseblk",
-        .get_sb         = fuse_get_sb_blk,
+        .mount          = fuse_mount_blk,
        .kill_sb        = fuse_kill_sb_blk,
        .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb96aae..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        struct gfs2_alloc *al = NULL;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-        unsigned to = from + len;
        struct page *page;
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        }
 prepare_write:
-        error = block_prepare_write(page, from, to, gfs2_block_map);
+        error = __block_write_begin(page, from, len, gfs2_block_map);
 out:
        if (error == 0)
                return 0;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
                 * activity, but those code paths have their own higher-level
                 * throttling.
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc233dc89..3eb1393f7b81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@ fail_sb:
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-        invalidate_inodes(sb);
        gfs2_gl_hash_clear(sdp);
        gfs2_lm_unmount(sdp);
 fail_sys:
@@ -1251,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 }
 /**
- * gfs2_get_sb - Get the GFS2 superblock
+ * gfs2_mount - Get the GFS2 superblock
 * @fs_type: The GFS2 filesystem type
 * @flags: Mount flags
 * @dev_name: The name of the device
 * @data: The mount arguments
- * @mnt: The vfsmnt for this mount
 *
 * Q. Why not use get_sb_bdev() ?
 * A. We need to select one of two root directories to mount, independent
@@ -1265,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 * Returns: 0 or -ve on error
 */
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data)
 {
        struct block_device *bdev;
        struct super_block *s;
@@ -1280,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
-                return PTR_ERR(bdev);
+                return ERR_CAST(bdev);
        /*
         * once the super is inserted into the list by sget, s_umount
@@ -1299,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        if (IS_ERR(s))
                goto error_bdev;
+        if (s->s_root)
+                close_bdev_exclusive(bdev, mode);
        memset(&args, 0, sizeof(args));
        args.ar_quota = GFS2_QUOTA_DEFAULT;
        args.ar_data = GFS2_DATA_DEFAULT;
@@ -1310,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        error = gfs2_mount_args(&args, data);
        if (error) {
                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-                if (s->s_root)
+                goto error_super;
-                        goto error_super;
-                deactivate_locked_super(s);
-                return error;
        }
        if (s->s_root) {
                error = -EBUSY;
                if ((flags ^ s->s_flags) & MS_RDONLY)
                        goto error_super;
-                close_bdev_exclusive(bdev, mode);
        } else {
                char b[BDEVNAME_SIZE];
@@ -1329,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
-                if (error) {
+                if (error)
-                        deactivate_locked_super(s);
+                        goto error_super;
-                        return error;
-                }
                s->s_flags |= MS_ACTIVE;
                bdev->bd_super = s;
        }
        sdp = s->s_fs_info;
-        mnt->mnt_sb = s;
        if (args.ar_meta)
-                mnt->mnt_root = dget(sdp->sd_master_dir);
+                return dget(sdp->sd_master_dir);
        else
-                mnt->mnt_root = dget(sdp->sd_root_dir);
+                return dget(sdp->sd_root_dir);
-        return 0;
 error_super:
        deactivate_locked_super(s);
+        return ERR_PTR(error);
 error_bdev:
        close_bdev_exclusive(bdev, mode);
-        return error;
+        return ERR_PTR(error);
 }
 static int set_meta_super(struct super_block *s, void *ptr)
@@ -1357,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
        return -EINVAL;
 }
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
-                            const char *dev_name, void *data, struct vfsmount *mnt)
+                        int flags, const char *dev_name, void *data)
 {
        struct super_block *s;
        struct gfs2_sbd *sdp;
@@ -1369,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
        if (error) {
                printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
                       dev_name, error);
-                return error;
+                return ERR_PTR(error);
        }
        s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
                 path.dentry->d_inode->i_sb->s_bdev);
        path_put(&path);
        if (IS_ERR(s)) {
                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        }
        if ((flags ^ s->s_flags) & MS_RDONLY) {
                deactivate_locked_super(s);
-                return -EBUSY;
+                return ERR_PTR(-EBUSY);
        }
        sdp = s->s_fs_info;
-        mnt->mnt_sb = s;
+        return dget(sdp->sd_master_dir);
-        mnt->mnt_root = dget(sdp->sd_master_dir);
-        return 0;
 }
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1411,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 struct file_system_type gfs2_fs_type = {
        .name = "gfs2",
        .fs_flags = FS_REQUIRES_DEV,
-        .get_sb = gfs2_get_sb,
+        .mount = gfs2_mount,
        .kill_sb = gfs2_kill_sb,
        .owner = THIS_MODULE,
 };
@@ -1419,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
 struct file_system_type gfs2meta_fs_type = {
        .name = "gfs2meta",
        .fs_flags = FS_REQUIRES_DEV,
-        .get_sb = gfs2_get_sb_meta,
+        .mount = gfs2_mount_meta,
        .owner = THIS_MODULE,
 };
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510200d5..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@ out_parent:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
        if (!error) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_instantiate(dentry, inode);
                mark_inode_dirty(inode);
        }
@@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        int error;
        if (!page_has_buffers(page)) {
-                error = block_prepare_write(page, from, to, gfs2_block_map);
+                error = __block_write_begin(page, from, to - from, gfs2_block_map);
                if (unlikely(error))
                        return error;
@@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                next += bh->b_size;
                if (buffer_mapped(bh)) {
                        if (end) {
-                                error = block_prepare_write(page, start, end,
+                                error = __block_write_begin(page, start, end - start,
                                                            gfs2_block_map);
                                if (unlikely(error))
                                        return error;
@@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        } while (next < to);
        if (end) {
-                error = block_prepare_write(page, start, end, gfs2_block_map);
+                error = __block_write_begin(page, start, end - start, gfs2_block_map);
                if (unlikely(error))
                        return error;
                empty_write_end(page, start, end);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d1176096c..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@ restart:
        gfs2_clear_rgrpd(sdp);
        gfs2_jindex_free(sdp);
        /*  Take apart glock structures and buffer lists  */
-        invalidate_inodes(sdp->sd_vfs);
        gfs2_gl_hash_clear(sdp);
        /*  Unmount the locking protocol  */
        gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
        u16 blockoffset;
        int fs_div;
-        struct hlist_head rsrc_inodes;
 };
 #define HFS_FLG_BITMAP_DIRTY    0
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
        sb->s_dirt = 1;
 }
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-        while (buffer_locked(bh)) {
-                wait_on_buffer(bh);
-        }
-        if (buffer_dirty(bh)) {
-                ll_rw_block(WRITE, 1, &bh);
-                wait_on_buffer(bh);
-        }
-}
 #define sb_bread512(sb, sec, data) ({                   \
        struct buffer_head *__bh;                       \
        sector_t __block;                               \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
        HFS_I(inode)->rsrc_inode = dir;
        HFS_I(dir)->rsrc_inode = inode;
        igrab(dir);
-        hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
        d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
                mdb->drLsMod = hfs_mtime();
                mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
        }
        return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
                HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
                HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
                mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
        }
        if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 33254160f650..4824c27cebb8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -382,7 +382,6 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
                return -ENOMEM;
        sb->s_fs_info = sbi;
-        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
        res = -EINVAL;
        if (!parse_options((char *)data, sbi)) {
@@ -442,17 +441,16 @@ bail:
        return res;
 }
-static int hfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name, void *data,
+                      int flags, const char *dev_name, void *data)
-                      struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 static struct file_system_type hfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hfs",
-        .get_sb         = hfs_get_sb,
+        .mount          = hfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..9d59c0571f59 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
        inc_nlink(inode);
        hfsplus_instantiate(dst_dentry, inode, cnid);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
        sbi->file_count++;
@@ -317,8 +317,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                res = hfsplus_rename_cat(inode->i_ino,
                                         dir, &dentry->d_name,
                                         sbi->hidden_dir, &str);
-                if (!res)
+                if (!res) {
                        inode->i_flags |= S_DEAD;
+                        drop_nlink(inode);
+                }
                goto out;
        }
        res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 78449280dae0..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        inode->i_hash.pprev = &inode->i_hash.next;
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5b4667e08ef7..40a85a3ded6e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -92,7 +92,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
        mark_inode_dirty(inode);
 out_unlock_inode:
-        mutex_lock(&inode->i_mutex);
+        mutex_unlock(&inode->i_mutex);
 out_drop_write:
        mnt_drop_write(file->f_path.mnt);
 out:
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a88d7536103..52cc746d3ba3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode)
 #define HFSPLUS_INODE_SIZE      sizeof(struct hfsplus_inode_info)
-static int hfsplus_get_sb(struct file_system_type *fs_type,
+static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
-                          int flags, const char *dev_name, void *data,
+                          int flags, const char *dev_name, void *data)
-                          struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
-                           mnt);
 }
 static struct file_system_type hfsplus_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hfsplus",
-        .get_sb         = hfsplus_get_sb,
+        .mount          = hfsplus_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 7c232c1487ee..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
                     long long *bfree_out, long long *bavail_out,
                     long long *files_out, long long *ffree_out,
-                     void *fsid_out, int fsid_size, long *namelen_out,
+                     void *fsid_out, int fsid_size, long *namelen_out);
-                     long *spare_out);
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..2c0f148a49e6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
        err = do_statfs(dentry->d_sb->s_fs_info,
                        &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
                        &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-                        &sf->f_namelen, sf->f_spare);
+                        &sf->f_namelen);
        if (err)
                return err;
        sf->f_blocks = f_blocks;
@@ -962,11 +962,11 @@ out:
        return err;
 }
-static int hostfs_read_sb(struct file_system_type *type,
+static struct dentry *hostfs_read_sb(struct file_system_type *type,
                          int flags, const char *dev_name,
-                          void *data, struct vfsmount *mnt)
+                          void *data)
 {
-        return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
+        return mount_nodev(type, flags, data, hostfs_fill_sb_common);
 }
 static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s)
 static struct file_system_type hostfs_type = {
        .owner          = THIS_MODULE,
        .name           = "hostfs",
-        .get_sb         = hostfs_read_sb,
+        .mount          = hostfs_read_sb,
        .kill_sb        = hostfs_kill_sb,
        .fs_flags       = 0,
 };
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
        dir = opendir(path);
        *err_out = errno;
-        if (dir == NULL)
-                return NULL;
        return dir;
 }
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
        if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
                if (fd >= 0) {
                        if (fchmod(fd, attrs->ia_mode) != 0)
-                                return (-errno);
+                                return -errno;
                } else if (chmod(file, attrs->ia_mode) != 0) {
                        return -errno;
                }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
              long long *bfree_out, long long *bavail_out,
              long long *files_out, long long *ffree_out,
-              void *fsid_out, int fsid_size, long *namelen_out,
+              void *fsid_out, int fsid_size, long *namelen_out)
-              long *spare_out)
 {
        struct statfs64 buf;
        int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
               sizeof(buf.f_fsid) > fsid_size ? fsid_size :
               sizeof(buf.f_fsid));
        *namelen_out = buf.f_namelen;
-        spare_out[0] = buf.f_spare[0];
-        spare_out[1] = buf.f_spare[1];
-        spare_out[2] = buf.f_spare[2];
-        spare_out[3] = buf.f_spare[3];
-        spare_out[4] = buf.f_spare[4];
        return 0;
 }
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index c969a1aa163a..bb69389972eb 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -686,17 +686,16 @@ bail0:
        return -EINVAL;
 }
-static int hpfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hpfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
-                           mnt);
 }
 static struct file_system_type hpfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hpfs",
-        .get_sb         = hpfs_get_sb,
+        .mount          = hpfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 4e2a45ea6140..f702b5f713fc 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -748,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
        return(err);
 }
-static int hppfs_read_super(struct file_system_type *type,
+static struct dentry *hppfs_read_super(struct file_system_type *type,
                            int flags, const char *dev_name,
-                            void *data, struct vfsmount *mnt)
+                            void *data)
 {
-        return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
+        return mount_nodev(type, flags, data, hppfs_fill_super);
 }
 static struct file_system_type hppfs_type = {
        .owner          = THIS_MODULE,
        .name           = "hppfs",
-        .get_sb         = hppfs_read_super,
+        .mount          = hppfs_read_super,
        .kill_sb        = kill_anon_super,
        .fs_flags       = 0,
 };
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 113eba3d3c38..d6cfac1f0a40 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 #include <asm/uaccess.h>
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = uid;
                inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
        return 0;
 }
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+                                struct page *newpage, struct page *page)
+{
+        int rc;
+        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+        if (rc)
+                return rc;
+        migrate_page_copy(newpage, page);
+        return 0;
+}
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
        .write_begin    = hugetlbfs_write_begin,
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
+        .migratepage    = hugetlbfs_migrate_page,
 };
@@ -880,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
        }
 }
-static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
 }
 static struct file_system_type hugetlbfs_fs_type = {
        .name           = "hugetlbfs",
-        .get_sb         = hugetlbfs_get_sb,
+        .mount          = hugetlbfs_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..ae2727ab0c3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/ima.h>
 /*
 * This is needed for the following functions:
 *  - inode_has_buffers
- *  - invalidate_inode_buffers
 *  - invalidate_bdev
 *
 * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
 * allowing for low-overhead inode sync() operations.
 */
-LIST_HEAD(inode_in_use);
+static LIST_HEAD(inode_lru);
-LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 /*
@@ -103,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
 */
 struct inodes_stat_t inodes_stat;
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 static struct kmem_cache *inode_cachep __read_mostly;
+static inline int get_nr_inodes(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes);
+}
+static inline int get_nr_inodes_unused(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+int get_nr_dirty_inodes(void)
+{
+        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+        return nr_dirty > 0 ? nr_dirty : 0;
+}
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        inodes_stat.nr_inodes = get_nr_inodes();
+        inodes_stat.nr_unused = get_nr_inodes_unused();
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
 static void wake_up_inode(struct inode *inode)
 {
        /*
@@ -192,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
+        percpu_counter_inc(&nr_inodes);
        return 0;
 out:
        return -ENOMEM;
@@ -232,11 +266,13 @@ void __destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
+        percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
+        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
@@ -255,6 +291,8 @@ void inode_init_once(struct inode *inode)
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_dentry);
        INIT_LIST_HEAD(&inode->i_devices);
+        INIT_LIST_HEAD(&inode->i_wb_list);
+        INIT_LIST_HEAD(&inode->i_lru);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        spin_lock_init(&inode->i_data.tree_lock);
        spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,14 +319,109 @@ static void init_once(void *foo)
 */
 void __iget(struct inode *inode)
 {
-        if (atomic_inc_return(&inode->i_count) != 1)
+        atomic_inc(&inode->i_count);
-                return;
+}
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+static void inode_lru_list_add(struct inode *inode)
+{
+        if (list_empty(&inode->i_lru)) {
+                list_add(&inode->i_lru, &inode_lru);
+                percpu_counter_inc(&nr_inodes_unused);
+        }
+}
-        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+static void inode_lru_list_del(struct inode *inode)
-                list_move(&inode->i_list, &inode_in_use);
+{
-        inodes_stat.nr_unused--;
+        if (!list_empty(&inode->i_lru)) {
+                list_del_init(&inode->i_lru);
+                percpu_counter_dec(&nr_inodes_unused);
+        }
+}
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
 }
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        __inode_sb_list_add(inode);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+        list_del_init(&inode->i_sb_list);
+}
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+        unsigned long tmp;
+        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                        L1_CACHE_BYTES;
+        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+        return tmp & I_HASHMASK;
+}
+/**
+ *      __insert_inode_hash - hash an inode
+ *      @inode: unhashed inode
+ *      @hashval: unsigned long value used to locate this object in the
+ *              inode_hashtable.
+ *
+ *      Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+        spin_lock(&inode_lock);
+        hlist_add_head(&inode->i_hash, b);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+/**
+ *      __remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the superblock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+        hlist_del_init(&inode->i_hash);
+}
+/**
+ *      remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the superblock.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        hlist_del_init(&inode->i_hash);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
 void end_writeback(struct inode *inode)
 {
        might_sleep();
@@ -327,101 +460,113 @@ static void evict(struct inode *inode)
 */
 static void dispose_list(struct list_head *head)
 {
-        int nr_disposed = 0;
        while (!list_empty(head)) {
                struct inode *inode;
-                inode = list_first_entry(head, struct inode, i_list);
+                inode = list_first_entry(head, struct inode, i_lru);
-                list_del(&inode->i_list);
+                list_del_init(&inode->i_lru);
                evict(inode);
                spin_lock(&inode_lock);
-                hlist_del_init(&inode->i_hash);
+                __remove_inode_hash(inode);
-                list_del_init(&inode->i_sb_list);
+                __inode_sb_list_del(inode);
                spin_unlock(&inode_lock);
                wake_up_inode(inode);
                destroy_inode(inode);
-                nr_disposed++;
        }
-        spin_lock(&inode_lock);
-        inodes_stat.nr_inodes -= nr_disposed;
-        spin_unlock(&inode_lock);
 }
-/*
+/**
- * Invalidate all inodes for a device.
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb:         superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
 */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+void evict_inodes(struct super_block *sb)
 {
-        struct list_head *next;
+        struct inode *inode, *next;
-        int busy = 0, count = 0;
+        LIST_HEAD(dispose);
-        next = head->next;
-        for (;;) {
-                struct list_head *tmp = next;
-                struct inode *inode;
-                /*
+        down_write(&iprune_sem);
-                 * We can reschedule here without worrying about the list's
-                 * consistency because the per-sb list of inodes must not
-                 * change during umount anymore, and because iprune_sem keeps
-                 * shrink_icache_memory() away.
-                 */
-                cond_resched_lock(&inode_lock);
-                next = next->next;
+        spin_lock(&inode_lock);
-                if (tmp == head)
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-                        break;
+                if (atomic_read(&inode->i_count))
-                inode = list_entry(tmp, struct inode, i_sb_list);
-                if (inode->i_state & I_NEW)
                        continue;
-                invalidate_inode_buffers(inode);
-                if (!atomic_read(&inode->i_count)) {
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-                        list_move(&inode->i_list, dispose);
+                        WARN_ON(1);
-                        WARN_ON(inode->i_state & I_NEW);
-                        inode->i_state |= I_FREEING;
-                        count++;
                        continue;
                }
-                busy = 1;
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
        }
-        /* only unused inodes may be cached with i_count zero */
+        spin_unlock(&inode_lock);
-        inodes_stat.nr_unused -= count;
-        return busy;
+        dispose_list(&dispose);
+        up_write(&iprune_sem);
 }
 /**
- *      invalidate_inodes       - discard the inodes on a device
+ * invalidate_inodes    - attempt to free all inodes on a superblock
- *      @sb: superblock
+ * @sb:         superblock to operate on
 *
- *      Discard all of the inodes for a given superblock. If the discard
+ * Attempts to free all inodes for a given superblock.  If there were any
- *      fails because there are busy inodes then a non zero value is returned.
+ * busy inodes return a non-zero value, else zero.
- *      If the discard is successful all the inodes have been discarded.
 */
 int invalidate_inodes(struct super_block *sb)
 {
-        int busy;
+        int busy = 0;
-        LIST_HEAD(throw_away);
+        struct inode *inode, *next;
+        LIST_HEAD(dispose);
        down_write(&iprune_sem);
        spin_lock(&inode_lock);
-        fsnotify_unmount_inodes(&sb->s_inodes);
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-        busy = invalidate_list(&sb->s_inodes, &throw_away);
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+                        continue;
+                if (atomic_read(&inode->i_count)) {
+                        busy = 1;
+                        continue;
+                }
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
+        }
        spin_unlock(&inode_lock);
-        dispose_list(&throw_away);
+        dispose_list(&dispose);
        up_write(&iprune_sem);
        return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 static int can_unuse(struct inode *inode)
 {
-        if (inode->i_state)
+        if (inode->i_state & ~I_REFERENCED)
                return 0;
        if (inode_has_buffers(inode))
                return 0;
@@ -433,22 +578,24 @@ static int can_unuse(struct inode *inode)
 }
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
+ * pagecache removed.  If the inode has metadata buffers attached to
- * the front of the inode_unused list.  So look for it there and if the
+ * mapping->private_list then try to remove them.
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
 *
- * If the inode has metadata buffers attached to mapping->private_list then
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
- * try to remove them.
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
 */
 static void prune_icache(int nr_to_scan)
 {
        LIST_HEAD(freeable);
-        int nr_pruned = 0;
        int nr_scanned;
        unsigned long reap = 0;
@@ -457,13 +604,26 @@ static void prune_icache(int nr_to_scan)
        for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                struct inode *inode;
-                if (list_empty(&inode_unused))
+                if (list_empty(&inode_lru))
                        break;
-                inode = list_entry(inode_unused.prev, struct inode, i_list);
+                inode = list_entry(inode_lru.prev, struct inode, i_lru);
-                if (inode->i_state || atomic_read(&inode->i_count)) {
+                /*
-                        list_move(&inode->i_list, &inode_unused);
+                 * Referenced or dirty inodes are still in use. Give them
+                 * another pass through the LRU as we cannot reclaim them now.
+                 */
+                if (atomic_read(&inode->i_count) ||
+                    (inode->i_state & ~I_REFERENCED)) {
+                        list_del_init(&inode->i_lru);
+                        percpu_counter_dec(&nr_inodes_unused);
+                        continue;
+                }
+                /* recently referenced inodes get one more pass */
+                if (inode->i_state & I_REFERENCED) {
+                        list_move(&inode->i_lru, &inode_lru);
+                        inode->i_state &= ~I_REFERENCED;
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +635,23 @@ static void prune_icache(int nr_to_scan)
                        iput(inode);
                        spin_lock(&inode_lock);
-                        if (inode != list_entry(inode_unused.next,
+                        if (inode != list_entry(inode_lru.next,
-                                                struct inode, i_list))
+                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        if (!can_unuse(inode))
                                continue;
                }
-                list_move(&inode->i_list, &freeable);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_FREEING;
-                nr_pruned++;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &freeable);
+                list_del_init(&inode->i_wb_list);
+                percpu_counter_dec(&nr_inodes_unused);
        }
-        inodes_stat.nr_unused -= nr_pruned;
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
@@ -518,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
                        return -1;
                prune_icache(nr);
        }
-        return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker icache_shrinker = {
@@ -529,9 +694,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
 * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
 */
 static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
@@ -551,9 +713,10 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
 }
 /*
@@ -576,53 +739,49 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
-}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-        unsigned long tmp;
-        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                        L1_CACHE_BYTES;
-        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-        return tmp & I_HASHMASK;
-}
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-                        struct inode *inode)
-{
-        inodes_stat.nr_inodes++;
-        list_add(&inode->i_list, &inode_in_use);
-        list_add(&inode->i_sb_list, &sb->s_inodes);
-        if (head)
-                hlist_add_head(&inode->i_hash, head);
 }
-/**
+/*
- * inode_add_to_lists - add a new inode to relevant lists
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
- * @sb: superblock inode belongs to
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
- * @inode: inode to mark in use
+ * to renew the exhausted range.
 *
- * When an inode is allocated it needs to be accounted for, added to the in use
+ * This does not significantly increase overflow rate because every CPU can
- * list, the owning superblock and the inode hash. This needs to be done under
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
- * the inode_lock, so export a function to do this rather than the inode lock
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
- * itself. We calculate the hash list to add to here so it is all internal
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
- * which requires the caller to have already set up the inode number in the
+ * overflow rate by 2x, which does not seem too significant.
- * inode to add.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
 */
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+unsigned int get_next_ino(void)
 {
-        struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+        unsigned int *p = &get_cpu_var(last_ino);
+        unsigned int res = *p;
-        spin_lock(&inode_lock);
+#ifdef CONFIG_SMP
-        __inode_add_to_lists(sb, head, inode);
+        if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
-        spin_unlock(&inode_lock);
+                static atomic_t shared_last_ino;
+                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+                res = next - LAST_INO_BATCH;
+        }
+#endif
+        *p = ++res;
+        put_cpu_var(last_ino);
+        return res;
 }
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
+EXPORT_SYMBOL(get_next_ino);
 /**
 *      new_inode       - obtain an inode
@@ -638,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
 */
 struct inode *new_inode(struct super_block *sb)
 {
-        /*
-         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-         * error if st_ino won't fit in target struct field. Use 32bit counter
-         * here to attempt to avoid that.
-         */
-        static unsigned int last_ino;
        struct inode *inode;
        spin_lock_prefetch(&inode_lock);
@@ -651,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
-                __inode_add_to_lists(sb, NULL, inode);
+                __inode_sb_list_add(inode);
-                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
        }
@@ -663,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-        if (inode->i_mode & S_IFDIR) {
+        if (S_ISDIR(inode->i_mode)) {
                struct file_system_type *type = inode->i_sb->s_type;
                /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
                        if (set(inode, data))
                                goto set_failed;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -735,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -767,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -782,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -791,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
        return inode;
 }
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+        struct hlist_head *b = inode_hashtable + hash(sb, ino);
+        struct hlist_node *node;
+        struct inode *inode;
+        hlist_for_each_entry(inode, node, b, i_hash) {
+                if (inode->i_ino == ino && inode->i_sb == sb)
+                        return 0;
+        }
+        return 1;
+}
 /**
 *      iunique - get a unique inode number
 *      @sb: superblock
@@ -812,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
+        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
-        struct inode *inode;
-        struct hlist_head *head;
        ino_t res;
        spin_lock(&inode_lock);
+        spin_lock(&iunique_lock);
        do {
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                res = counter++;
-                head = inode_hashtable + hash(sb, res);
+        } while (!test_inode_iunique(sb, res));
-                inode = find_inode_fast(sb, head, res);
+        spin_unlock(&iunique_lock);
-        } while (inode != NULL);
        spin_unlock(&inode_lock);
        return res;
@@ -876,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode(sb, head, test, data);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                if (likely(wait))
                        wait_on_inode(inode);
@@ -909,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode_fast(sb, head, ino);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                wait_on_inode(inode);
                return inode;
@@ -1095,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1134,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1143,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 }
 EXPORT_SYMBOL(insert_inode_locked4);
-/**
- *      __insert_inode_hash - hash an inode
- *      @inode: unhashed inode
- *      @hashval: unsigned long value used to locate this object in the
- *              inode_hashtable.
- *
- *      Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
-        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-        spin_lock(&inode_lock);
-        hlist_add_head(&inode->i_hash, head);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
-/**
- *      remove_inode_hash - remove an inode from the hash
- *      @inode: inode to unhash
- *
- *      Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
-{
-        spin_lock(&inode_lock);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(remove_inode_hash);
 int generic_delete_inode(struct inode *inode)
 {
@@ -1187,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode);
 */
 int generic_drop_inode(struct inode *inode)
 {
-        return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+        return !inode->i_nlink || inode_unhashed(inode);
 }
 EXPORT_SYMBOL_GPL(generic_drop_inode);
@@ -1213,10 +1353,11 @@ static void iput_final(struct inode *inode)
                drop = generic_drop_inode(inode);
        if (!drop) {
-                if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                        list_move(&inode->i_list, &inode_unused);
-                inodes_stat.nr_unused++;
                if (sb->s_flags & MS_ACTIVE) {
+                        inode->i_state |= I_REFERENCED;
+                        if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+                                inode_lru_list_add(inode);
+                        }
                        spin_unlock(&inode_lock);
                        return;
                }
@@ -1227,19 +1368,23 @@ static void iput_final(struct inode *inode)
                spin_lock(&inode_lock);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state &= ~I_WILL_FREE;
-                inodes_stat.nr_unused--;
+                __remove_inode_hash(inode);
-                hlist_del_init(&inode->i_hash);
        }
-        list_del_init(&inode->i_list);
-        list_del_init(&inode->i_sb_list);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
-        inodes_stat.nr_inodes--;
+        /*
+         * Move the inode off the IO lists and LRU once I_FREEING is
+         * set so that it won't get moved back on there if it is dirty.
+         */
+        inode_lru_list_del(inode);
+        list_del_init(&inode->i_wb_list);
+        __inode_sb_list_del(inode);
        spin_unlock(&inode_lock);
        evict(inode);
-        spin_lock(&inode_lock);
+        remove_inode_hash(inode);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
        wake_up_inode(inode);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        destroy_inode(inode);
@@ -1503,6 +1648,8 @@ void __init inode_init(void)
                                         SLAB_MEM_SPREAD),
                                         init_once);
        register_shrinker(&icache_shrinker);
+        percpu_counter_init(&nr_inodes, 0);
+        percpu_counter_init(&nr_inodes_unused, 0);
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..e43b9a4dbf4e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+/*
+ * inode.c
+ */
+extern int get_nr_dirty_inodes(void);
+extern void evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..e92fdbb3bc3a 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
+static int ioctl_fstrim(struct file *filp, void __user *argp)
+{
+        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+        struct fstrim_range range;
+        int ret = 0;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* If filesystem doesn't support trim feature, return. */
+        if (sb->s_op->trim_fs == NULL)
+                return -EOPNOTSUPP;
+        /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+        if (sb->s_bdev == NULL)
+                return -EINVAL;
+        if (argp == NULL) {
+                range.start = 0;
+                range.len = ULLONG_MAX;
+                range.minlen = 0;
+        } else if (copy_from_user(&range, argp, sizeof(range)))
+                return -EFAULT;
+        ret = sb->s_op->trim_fs(sb, &range);
+        if (ret < 0)
+                return ret;
+        if ((argp != NULL) &&
+            (copy_to_user(argp, &range, sizeof(range))))
+                return -EFAULT;
+        return 0;
+}
 /*
 * When you add any new common ioctls to the switches above and below
 * please update compat_sys_ioctl() too.
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
+        case FITRIM:
+                error = ioctl_fstrim(filp, argp);
+                break;
        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 09ff41a752a0..bfdeb82a53be 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -544,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
 }
 /*
+ * Check if root directory is empty (has fewer than 3 files).
+ *
+ * Used to detect broken CDs where ISO root directory is empty but Joliet root
+ * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
+ * (and Joliet used instead) or else no files would be visible.
+ */
+static bool rootdir_empty(struct super_block *sb, unsigned long block)
+{
+        int offset = 0, files = 0, de_len;
+        struct iso_directory_record *de;
+        struct buffer_head *bh;
+        bh = sb_bread(sb, block);
+        if (!bh)
+                return true;
+        while (files < 3) {
+                de = (struct iso_directory_record *) (bh->b_data + offset);
+                de_len = *(unsigned char *) de;
+                if (de_len == 0)
+                        break;
+                files++;
+                offset += de_len;
+        }
+        brelse(bh);
+        return files < 3;
+}
+/*
 * Initialize the superblock and read the root inode.
 *
 * Note: a check_disk_change() has been done immediately prior
@@ -843,6 +871,18 @@ root_found:
                goto out_no_root;
        /*
+         * Fix for broken CDs with Rock Ridge and empty ISO root directory but
+         * correct Joliet root directory.
+         */
+        if (sbi->s_rock == 1 && joliet_level &&
+                                rootdir_empty(s, sbi->s_firstdatazone)) {
+                printk(KERN_NOTICE
+                        "ISOFS: primary root directory is empty. "
+                        "Disabling Rock Ridge and switching to Joliet.");
+                sbi->s_rock = 0;
+        }
+        /*
         * If this disk has both Rock Ridge and Joliet on it, then we
         * want to use Rock Ridge by default.  This can be overridden
         * by using the norock mount option.  There is still one other
@@ -962,25 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 * or getblk() if they are not.  Returns the number of blocks inserted
 * (-ve == error.)
 */
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
                     struct buffer_head **bh, unsigned long nblocks)
 {
-        unsigned long b_off;
+        unsigned long b_off = iblock;
        unsigned offset, sect_size;
        unsigned int firstext;
        unsigned long nextblk, nextoff;
-        long iblock = (long)iblock_s;
        int section, rv, error;
        struct iso_inode_info *ei = ISOFS_I(inode);
        error = -EIO;
        rv = 0;
-        if (iblock < 0 || iblock != iblock_s) {
+        if (iblock != b_off) {
                printk(KERN_DEBUG "%s: block number too large\n", __func__);
                goto abort;
        }
-        b_off = iblock;
        offset = 0;
        firstext = ei->i_first_extent;
@@ -998,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                 * I/O errors.
                 */
                if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
-                        printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
+                        printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
-                                __func__, iblock, (unsigned long) inode->i_size);
+                                __func__, b_off,
+                                (unsigned long long)inode->i_size);
                        goto abort;
                }
@@ -1025,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                        if (++section > 100) {
                                printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
                                        " aborting...\n", __func__);
-                                printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+                                printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
                                        "nextblk=%lu nextoff=%lu\n", __func__,
-                                        iblock, firstext, (unsigned) sect_size,
+                                        b_off, firstext, (unsigned) sect_size,
                                        nextblk, nextoff);
                                goto abort;
                        }
@@ -1468,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
        return inode;
 }
-static int isofs_get_sb(struct file_system_type *fs_type,
+static struct dentry *isofs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
-                                mnt);
 }
 static struct file_system_type iso9660_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "iso9660",
-        .get_sb         = isofs_get_sb,
+        .mount          = isofs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
                        goto restart;
                }
                if (buffer_locked(bh)) {
-                        atomic_inc(&bh->b_count);
+                        get_bh(bh);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
        int ret = 0;
        if (buffer_locked(bh)) {
-                atomic_inc(&bh->b_count);
+                get_bh(bh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 85a6883c0aca..34a4861c14b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -587,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-                atomic_inc(&jh2bh(jh)->b_count);
+                get_bh(jh2bh(jh));
                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/
-                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+                set_buffer_jwrite(jh2bh(jh));
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
@@ -603,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
-                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+                set_buffer_jwrite(jh2bh(new_jh));
                wbuf[bufs++] = jh2bh(new_jh);
                /* Record the new block's tag in the current descriptor
@@ -713,7 +713,7 @@ wait_for_iobuf:
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
-                clear_bit(BH_JWrite, &bh->b_state);
+                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));
                /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/ratelimit.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static const char *journal_dev_name(journal_t *journal, char *buffer);
 /*
 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
         */
        if (!tid_geq(journal->j_commit_request, target)) {
                /*
-                 * We want a new commit: OK, mark the request and wakup the
+                 * We want a new commit: OK, mark the request and wake up the
                 * commit thread.  We do _not_ do the commit ourselves.
                 */
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
                if (err)
                        return err;
                bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+                if (unlikely(!bh))
+                        return -ENOMEM;
                lock_buffer(bh);
                memset (bh->b_data, 0, journal->j_blocksize);
                BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
                goto out;
        }
+        if (buffer_write_io_error(bh)) {
+                char b[BDEVNAME_SIZE];
+                /*
+                 * Oh, dear.  A previous attempt to write the journal
+                 * superblock failed.  This could happen because the
+                 * USB device was yanked out.  Or it could happen to
+                 * be a transient write error and maybe the block will
+                 * be remapped.  Nothing we can do but to retry the
+                 * write and hope for the best.
+                 */
+                printk(KERN_ERR "JBD: previous I/O error detected "
+                       "for journal superblock update for %s.\n",
+                       journal_dev_name(journal, b));
+                clear_buffer_write_io_error(bh);
+                set_buffer_uptodate(bh);
+        }
        spin_lock(&journal->j_state_lock);
        jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
                  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait)
        BUFFER_TRACE(bh, "marking dirty");
        mark_buffer_dirty(bh);
-        if (wait)
+        if (wait) {
                sync_dirty_buffer(bh);
-        else
+                if (buffer_write_io_error(bh)) {
+                        char b[BDEVNAME_SIZE];
+                        printk(KERN_ERR "JBD: I/O error detected "
+                               "when updating journal superblock for %s.\n",
+                               journal_dev_name(journal, b));
+                        clear_buffer_write_io_error(bh);
+                        set_buffer_uptodate(bh);
+                }
+        } else
                write_dirty_buffer(bh, WRITE);
 out:
@@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
        struct journal_head *ret;
-        static unsigned long last_warning;
 #ifdef CONFIG_JBD_DEBUG
        atomic_inc(&nr_journal_heads);
@@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
        ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
        if (ret == NULL) {
                jbd_debug(1, "out of memory for journal_head\n");
-                if (time_after(jiffies, last_warning + 5*HZ)) {
+                printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-                        printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+                                   __func__);
-                               __func__);
-                        last_warning = jiffies;
-                }
                while (ret == NULL) {
                        yield();
                        ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD_DEBUG
                int dropped = info.end_transaction -
                              be32_to_cpu(journal->j_superblock->s_sequence);
-#endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
                          dropped, (dropped == 1) ? "" : "s");
+#endif
                journal->j_transaction_sequence = ++info.end_transaction;
        }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..846a3f314111 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
                jbd_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
-                goto out;
        }
-out:
        return handle;
 }
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        transaction = handle->h_transaction;
        journal = transaction->t_journal;
-        jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+        jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
        JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -713,7 +711,7 @@ done:
                J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
                            "Possible IO failure.\n");
                page = jh2bh(jh)->b_page;
-                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+                offset = offset_in_page(jh2bh(jh)->b_data);
                source = kmap_atomic(page, KM_USER0);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6571a056e55d..6a79fd0a1a32 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                transaction->t_chp_stats.cs_forced_to_close++;
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
+                if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+                        /*
+                         * The journal thread is dead; so starting and
+                         * waiting for a commit to finish will cause
+                         * us to wait for a _very_ long time.
+                         */
+                        printk(KERN_ERR "JBD2: %s: "
+                               "Waiting for Godot: block %llu\n",
+                               journal->j_devname,
+                               (unsigned long long) bh->b_blocknr);
                jbd2_log_start_commit(journal, tid);
                jbd2_log_wait_commit(journal, tid);
                ret = 1;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index bc6be8bda1cc..f3ad1598b201 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/bitops.h>
 #include <trace/events/jbd2.h>
+#include <asm/system.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -201,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
-                jinode->i_flags |= JI_COMMIT_RUNNING;
+                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                /*
                 * submit the inode data buffers. We use writepage
@@ -216,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                commit_transaction->t_flushed_data_blocks = 1;
-                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
@@ -237,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                jinode->i_flags |= JI_COMMIT_RUNNING;
+                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
@@ -253,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
-                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 262419f83d80..538417c1fdbb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,14 @@
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
+#include <linux/bitops.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/system.h>
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
@@ -478,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
         */
        if (!tid_geq(journal->j_commit_request, target)) {
                /*
-                 * We want a new commit: OK, mark the request and wakup the
+                 * We want a new commit: OK, mark the request and wake up the
                 * commit thread.  We do _not_ do the commit ourselves.
                 */
@@ -2210,7 +2212,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
 restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
-        if (jinode->i_flags & JI_COMMIT_RUNNING) {
+        if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..6bf0a242613e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
         */
 repeat:
        read_lock(&journal->j_state_lock);
+        BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..79121aa5858b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
                mutex_unlock(&f->sem);
                d_instantiate(dentry, old_dentry->d_inode);
                dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        return ret;
 }
@@ -864,7 +864,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
                printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
                /* Might as well let the VFS know */
                d_instantiate(new_dentry, old_dentry->d_inode);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d1ae5dfc22b9..c86041b866a4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -179,12 +179,11 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
        return ret;
 }
-static int jffs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *jffs2_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super,
+        return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
-                          mnt);
 }
 static void jffs2_put_super (struct super_block *sb)
@@ -229,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
 static struct file_system_type jffs2_fs_type = {
        .owner =        THIS_MODULE,
        .name =         "jffs2",
-        .get_sb =       jffs2_get_sb,
+        .mount =        jffs2_mount,
        .kill_sb =      jffs2_kill_sb,
 };
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        ip->i_hash.pprev = &ip->i_hash.next;
+        hlist_add_fake(&ip->i_hash);
        return (ip);
 }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
         * lazy commit thread finishes processing
         */
        if (tblk->xflag & COMMIT_DELETE) {
-                atomic_inc(&tblk->u.ip->i_count);
+                ihold(tblk->u.ip);
                /*
                 * Avoid a rare deadlock
                 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
        ip->i_ctime = CURRENT_TIME;
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        mark_inode_dirty(dir);
-        atomic_inc(&ip->i_count);
+        ihold(ip);
        iplist[0] = ip;
        iplist[1] = dir;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 68eee2bf629e..0669fc1cc3bf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -583,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
        return 0;
 }
-static int jfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
-                           mnt);
 }
 static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -770,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
 static struct file_system_type jfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "jfs",
-        .get_sb         = jfs_get_sb,
+        .mount          = jfs_do_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa0387d6e..a3accdf528ad 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
-int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
-        const struct super_operations *ops, unsigned long magic,
+        const struct super_operations *ops, unsigned long magic)
-        struct vfsmount *mnt)
 {
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        struct qstr d_name = {.name = name, .len = strlen(name)};
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        s->s_flags = MS_NOUSER;
        s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        d_instantiate(dentry, root);
        s->s_root = dentry;
        s->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 Enomem:
        deactivate_locked_super(s);
-        return -ENOMEM;
+        return ERR_PTR(-ENOMEM);
 }
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -255,7 +253,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dget(dentry);
        d_instantiate(dentry, inode);
        return 0;
@@ -892,10 +890,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 */
 int generic_file_fsync(struct file *file, int datasync)
 {
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct inode *inode = file->f_mapping->host;
        int err;
        int ret;
@@ -906,7 +900,7 @@ int generic_file_fsync(struct file *file, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return ret;
-        err = sync_inode(inode, &wbc);
+        err = sync_inode_metadata(inode, 1);
        if (ret == 0)
                ret = err;
        return ret;
@@ -955,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
-EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(mount_pseudo);
 EXPORT_SYMBOL(simple_write_begin);
 EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..d5bb86866e6c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
 };
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 /**
 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
                block->b_lock = fl;
                init_waitqueue_head(&block->b_wait);
                block->b_status = nlm_lck_blocked;
+                spin_lock(&nlm_blocked_lock);
                list_add(&block->b_list, &nlm_blocked);
+                spin_unlock(&nlm_blocked_lock);
        }
        return block;
 }
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
 {
        if (block == NULL)
                return;
+        spin_lock(&nlm_blocked_lock);
        list_del(&block->b_list);
+        spin_unlock(&nlm_blocked_lock);
        kfree(block);
 }
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
         * Look up blocked request based on arguments. 
         * Warning: must not use cookie to match it!
         */
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                struct file_lock *fl_blocked = block->b_lock;
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
                wake_up(&block->b_wait);
                res = nlm_granted;
        }
+        spin_unlock(&nlm_blocked_lock);
        return res;
 }
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
        allow_signal(SIGKILL);
        down_write(&host->h_rwsem);
-        /* This one ensures that our parent doesn't terminate while the
-         * reclaim is in progress */
-        lock_kernel();
        lockd_up();     /* note: this cannot fail as lockd is already running */
        dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
        dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
        /* Now, wake up all processes that sleep on a blocked lock */
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (block->b_host == host) {
                        block->b_status = nlm_lck_denied_grace_period;
                        wake_up(&block->b_wait);
                }
        }
+        spin_unlock(&nlm_blocked_lock);
        /* Release host handle after use */
        nlm_release_host(host);
        lockd_down();
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..47ea1e1925b8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
        /* Set up the argument struct */
        nlmclnt_setlockargs(call, fl);
-        lock_kernel();
        if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
                if (fl->fl_type != F_UNLCK) {
                        call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
                status = nlmclnt_test(call, fl);
        else
                status = -EINVAL;
        fl->fl_ops->fl_release_private(fl);
        fl->fl_ops = NULL;
-        unlock_kernel();
        dprintk("lockd: clnt proc returns %d\n", status);
        return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
 static void nlmclnt_rpc_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
 static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+        spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
        new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
        list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
+        spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 }
 static void nlmclnt_locks_release_private(struct file_lock *fl)
 {
+        spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        list_del(&fl->fl_u.nfs_fl.list);
+        spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 die:
        return;
 retry_rebind:
-        lock_kernel();
        nlm_rebind_host(req->a_host);
-        unlock_kernel();
 retry_unlock:
        rpc_restart_call(task);
 }
@@ -801,9 +798,7 @@ retry_cancel:
        /* Don't ever retry more than 3 times */
        if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
                goto die;
-        lock_kernel();
        nlm_rebind_host(req->a_host);
-        unlock_kernel();
        rpc_restart_call(task);
        rpc_delay(task, 30 * HZ);
 }
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..25e21e4023b2 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
                        .to_retries     = 5U,
                };
                struct rpc_create_args args = {
+                        .net            = &init_net,
                        .protocol       = host->h_proto,
                        .address        = nlm_addr(host),
                        .addrsize       = host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..e0c918949644 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
                .sin_addr.s_addr        = htonl(INADDR_LOOPBACK),
        };
        struct rpc_create_args args = {
+                .net                    = &init_net,
                .protocol               = XPRT_TRANSPORT_UDP,
                .address                = (struct sockaddr *)&sin,
                .addrsize               = sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
 #include <linux/in.h>
 #include <linux/uio.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
        dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
-        /*
-         * FIXME: it would be nice if lockd didn't spend its entire life
-         * running under the BKL. At the very least, it would be good to
-         * have someone clarify what it's intended to protect here. I've
-         * seen some handwavy posts about posix locking needing to be
-         * done under the BKL, but it's far from clear.
-         */
-        lock_kernel();
        if (!nlm_timeout)
                nlm_timeout = LOCKD_DFLT_TIMEO;
        nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
        if (nlmsvc_ops)
                nlmsvc_invalidate_all();
        nlm_shutdown_hosts();
-        unlock_kernel();
        return 0;
 }
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
        xprt = svc_find_xprt(serv, name, family, 0);
        if (xprt == NULL)
-                return svc_create_xprt(serv, name, family, port,
+                return svc_create_xprt(serv, name, &init_net, family, port,
                                                SVC_SOCK_DEFAULTS);
        svc_xprt_put(xprt);
        return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..a336e832475d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 static void nlm4svc_callback_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..c462d346acbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
 * The list of blocked locks to retry
 */
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 /*
 * Insert a blocked lock into the global list
 */
 static void
-nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
 {
        struct nlm_block *b;
        struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
        block->b_when = when;
 }
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+        spin_lock(&nlm_blocked_lock);
+        nlmsvc_insert_block_locked(block, when);
+        spin_unlock(&nlm_blocked_lock);
+}
 /*
 * Remove a block from the global list
 */
@@ -94,7 +102,9 @@ static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
        if (!list_empty(&block->b_list)) {
+                spin_lock(&nlm_blocked_lock);
                list_del_init(&block->b_list);
+                spin_unlock(&nlm_blocked_lock);
                nlmsvc_release_block(block);
        }
 }
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
        struct nlm_block *block;
        int rc = -ENOENT;
-        lock_kernel();
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
                        dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
                        } else if (result == 0)
                                block->b_granted = 1;
-                        nlmsvc_insert_block(block, 0);
+                        nlmsvc_insert_block_locked(block, 0);
                        svc_wake_up(block->b_daemon);
                        rc = 0;
                        break;
                }
        }
-        unlock_kernel();
+        spin_unlock(&nlm_blocked_lock);
        if (rc == -ENOENT)
                printk(KERN_WARNING "lockd: grant for unknown block\n");
        return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
        struct nlm_block        *block;
        dprintk("lockd: VFS unblock notification for block %p\n", fl);
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
-                        nlmsvc_insert_block(block, 0);
+                        nlmsvc_insert_block_locked(block, 0);
+                        spin_unlock(&nlm_blocked_lock);
                        svc_wake_up(block->b_daemon);
                        return;
                }
        }
+        spin_unlock(&nlm_blocked_lock);
        printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
        dprintk("lockd: GRANT_MSG RPC callback\n");
-        lock_kernel();
+        spin_lock(&nlm_blocked_lock);
        /* if the block is not on a list at this point then it has
         * been invalidated. Don't try to requeue it.
         *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
                /* Call was successful, now wait for client callback */
                timeout = 60 * HZ;
        }
-        nlmsvc_insert_block(block, timeout);
+        nlmsvc_insert_block_locked(block, timeout);
        svc_wake_up(block->b_daemon);
 out:
-        unlock_kernel();
+        spin_unlock(&nlm_blocked_lock);
 }
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex.  This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
 static void nlmsvc_grant_release(void *data)
 {
        struct nlm_rqst         *call = data;
-        lock_kernel();
        nlmsvc_release_block(call->a_block);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..c3069f38d602 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 static void nlmsvc_callback_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 again:
        file->f_locks = 0;
+        lock_flocks(); /* protects i_flock list */
        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
                if (fl->fl_lmops != &nlmsvc_lock_operations)
                        continue;
@@ -181,6 +182,7 @@ again:
                if (match(lockhost, host)) {
                        struct file_lock lock = *fl;
+                        unlock_flocks();
                        lock.fl_type  = F_UNLCK;
                        lock.fl_start = 0;
                        lock.fl_end   = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
                        goto again;
                }
        }
+        unlock_flocks();
        return 0;
 }
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
        if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
                return 1;
+        lock_flocks();
        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-                if (fl->fl_lmops == &nlmsvc_lock_operations)
+                if (fl->fl_lmops == &nlmsvc_lock_operations) {
+                        unlock_flocks();
                        return 1;
+                }
        }
+        unlock_flocks();
        file->f_locks = 0;
        return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index 8b2b6ad56a09..50ec15927aab 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,6 +142,7 @@ int lease_break_time = 45;
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
+static DEFINE_SPINLOCK(file_lock_lock);
 /*
 * Protects the two list heads above, plus the inode->i_flock list
@@ -149,23 +150,24 @@ static LIST_HEAD(blocked_list);
 */
 void lock_flocks(void)
 {
-        lock_kernel();
+        spin_lock(&file_lock_lock);
 }
 EXPORT_SYMBOL_GPL(lock_flocks);
 void unlock_flocks(void)
 {
-        unlock_kernel();
+        spin_unlock(&file_lock_lock);
 }
 EXPORT_SYMBOL_GPL(unlock_flocks);
 static struct kmem_cache *filelock_cache __read_mostly;
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(void)
 {
        return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
 void locks_release_private(struct file_lock *fl)
 {
@@ -1365,7 +1367,6 @@ int fcntl_getlease(struct file *filp)
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
        struct file_lock *fl, **before, **my_before = NULL, *lease;
-        struct file_lock *new_fl = NULL;
        struct dentry *dentry = filp->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        int error, rdlease_count = 0, wrlease_count = 0;
@@ -1385,11 +1386,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
        lease = *flp;
        if (arg != F_UNLCK) {
-                error = -ENOMEM;
-                new_fl = locks_alloc_lock();
-                if (new_fl == NULL)
-                        goto out;
                error = -EAGAIN;
                if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                        goto out;
@@ -1434,7 +1430,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
                goto out;
        }
-        error = 0;
        if (arg == F_UNLCK)
                goto out;
@@ -1442,15 +1437,11 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
        if (!leases_enable)
                goto out;
-        locks_copy_lock(new_fl, lease);
+        locks_insert_lock(before, lease);
-        locks_insert_lock(before, new_fl);
-        *flp = new_fl;
        return 0;
 out:
-        if (new_fl != NULL)
+        locks_free_lock(lease);
-                locks_free_lock(new_fl);
        return error;
 }
 EXPORT_SYMBOL(generic_setlease);
@@ -1514,26 +1505,38 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
 */
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
-        struct file_lock fl, *flp = &fl;
+        struct file_lock *fl;
+        struct fasync_struct *new;
        struct inode *inode = filp->f_path.dentry->d_inode;
        int error;
-        locks_init_lock(&fl);
+        fl = lease_alloc(filp, arg);
-        error = lease_init(filp, arg, &fl);
+        if (IS_ERR(fl))
-        if (error)
+                return PTR_ERR(fl);
-                return error;
+        new = fasync_alloc();
+        if (!new) {
+                locks_free_lock(fl);
+                return -ENOMEM;
+        }
        lock_flocks();
+        error = __vfs_setlease(filp, arg, &fl);
-        error = __vfs_setlease(filp, arg, &flp);
        if (error || arg == F_UNLCK)
                goto out_unlock;
-        error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
+        /*
+         * fasync_insert_entry() returns the old entry if any.
+         * If there was no old entry, then it used 'new' and
+         * inserted it into the fasync list. Clear new so that
+         * we don't release it here.
+         */
+        if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new))
+                new = NULL;
        if (error < 0) {
                /* remove lease just inserted by setlease */
-                flp->fl_type = F_UNLCK | F_INPROGRESS;
+                fl->fl_type = F_UNLCK | F_INPROGRESS;
-                flp->fl_break_time = jiffies - 10;
+                fl->fl_break_time = jiffies - 10;
                time_out_leases(inode);
                goto out_unlock;
        }
@@ -1541,6 +1544,8 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 out_unlock:
        unlock_flocks();
+        if (new)
+                fasync_free(new);
        return error;
 }
@@ -2109,7 +2114,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-                                                        int id, char *pfx)
+                            loff_t id, char *pfx)
 {
        struct inode *inode = NULL;
        unsigned int fl_pid;
@@ -2122,7 +2127,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        if (fl->fl_file != NULL)
                inode = fl->fl_file->f_path.dentry->d_inode;
-        seq_printf(f, "%d:%s ", id, pfx);
+        seq_printf(f, "%lld:%s ", id, pfx);
        if (IS_POSIX(fl)) {
                seq_printf(f, "%6s %s ",
                             (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2185,24 +2190,27 @@ static int locks_show(struct seq_file *f, void *v)
        fl = list_entry(v, struct file_lock, fl_link);
-        lock_get_status(f, fl, (long)f->private, "");
+        lock_get_status(f, fl, *((loff_t *)f->private), "");
        list_for_each_entry(bfl, &fl->fl_block, fl_block)
-                lock_get_status(f, bfl, (long)f->private, " ->");
+                lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
-        f->private++;
        return 0;
 }
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
+        loff_t *p = f->private;
        lock_flocks();
-        f->private = (void *)1;
+        *p = (*pos + 1);
        return seq_list_start(&file_lock_list, *pos);
 }
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+        loff_t *p = f->private;
+        ++*p;
        return seq_list_next(v, &file_lock_list, pos);
 }
@@ -2220,14 +2228,14 @@ static const struct seq_operations locks_seq_operations = {
 static int locks_open(struct inode *inode, struct file *filp)
 {
-        return seq_open(filp, &locks_seq_operations);
+        return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 static const struct file_operations proc_locks_operations = {
        .open           = locks_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-        .release        = seq_release,
+        .release        = seq_release_private,
 };
 static int __init proc_locks_init(void)
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..92ca6fbe09bd 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
        return sync_request(page, bdev, WRITE);
 }
-static void bdev_put_device(struct super_block *sb)
+static void bdev_put_device(struct logfs_super *s)
 {
-        close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+        close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = {
        .put_device     = bdev_put_device,
 };
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
-                const char *devname, struct vfsmount *mnt)
+                const char *devname)
 {
        struct block_device *bdev;
@@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags,
        if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
                int mtdnr = MINOR(bdev->bd_dev);
                close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-                return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+                return logfs_get_sb_mtd(p, mtdnr);
        }
-        return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+        p->s_bdev = bdev;
+        p->s_mtd = NULL;
+        p->s_devops = &bd_devops;
+        return 0;
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
        __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
 }
-static void mtd_put_device(struct super_block *sb)
+static void mtd_put_device(struct logfs_super *s)
 {
-        put_mtd_device(logfs_super(sb)->s_mtd);
+        put_mtd_device(s->s_mtd);
 }
 static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
        .put_device     = mtd_put_device,
 };
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-                int mtdnr, struct vfsmount *mnt)
 {
-        struct mtd_info *mtd;
+        struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
-        const struct logfs_device_ops *devops = &mtd_devops;
-        mtd = get_mtd_device(NULL, mtdnr);
        if (IS_ERR(mtd))
                return PTR_ERR(mtd);
-        return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+        s->s_bdev = NULL;
+        s->s_mtd = mtd;
+        s->s_devops = &mtd_devops;
+        return 0;
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 1eb4e89e045b..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
                return -EMLINK;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_nlink++;
        mark_inode_dirty_sync(inode);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..cd51a36b37f0 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
        int     (*erase_segment)(struct logfs_area *area);
 };
+struct logfs_super;     /* forward */
 /**
 * struct logfs_device_ops - device access operations
 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
                        int ensure_write);
        int (*can_write_buf)(struct super_block *sb, u64 ofs);
        void (*sync)(struct super_block *sb);
-        void (*put_device)(struct super_block *sb);
+        void (*put_device)(struct logfs_super *s);
 };
 /**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *s,
-                const char *devname, struct vfsmount *mnt);
+                struct file_system_type *type,
+                const char *devname);
 #else
-static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
-                const char *devname, struct vfsmount *mnt)
+                struct file_system_type *type,
+                const char *devname)
 {
        return -ENODEV;
 }
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
-                int mtdnr, struct vfsmount *mnt);
 #else
-static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-                int mtdnr, struct vfsmount *mnt)
 {
        return -ENODEV;
 }
@@ -619,9 +620,6 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-                struct mtd_info *mtd, struct block_device *bdev,
-                const struct logfs_device_ops *devops, struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
        return 0;
 }
-static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+static int logfs_get_sb_final(struct super_block *sb)
 {
        struct logfs_super *super = logfs_super(sb);
        struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
        }
        log_super("LogFS: Finished mounting\n");
-        simple_set_mnt(mnt, sb);
        return 0;
 fail:
@@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
        logfs_cleanup_rw(sb);
        if (super->s_erase_page)
                __free_page(super->s_erase_page);
-        super->s_devops->put_device(sb);
+        super->s_devops->put_device(super);
        logfs_mempool_destroy(super->s_btree_pool);
        logfs_mempool_destroy(super->s_alias_pool);
        kfree(super);
        log_super("LogFS: Finished unmounting\n");
 }
-int logfs_get_sb_device(struct file_system_type *type, int flags,
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
-                struct mtd_info *mtd, struct block_device *bdev,
+                struct file_system_type *type, int flags)
-                const struct logfs_device_ops *devops, struct vfsmount *mnt)
 {
-        struct logfs_super *super;
        struct super_block *sb;
        int err = -ENOMEM;
        static int mount_count;
        log_super("LogFS: Start mount %x\n", mount_count++);
-        super = kzalloc(sizeof(*super), GFP_KERNEL);
-        if (!super)
-                goto err0;
-        super->s_mtd    = mtd;
-        super->s_bdev   = bdev;
        err = -EINVAL;
        sb = sget(type, logfs_sb_test, logfs_sb_set, super);
-        if (IS_ERR(sb))
+        if (IS_ERR(sb)) {
-                goto err0;
+                super->s_devops->put_device(super);
+                kfree(super);
+                return ERR_CAST(sb);
+        }
        if (sb->s_root) {
                /* Device is already in use */
-                err = 0;
+                super->s_devops->put_device(super);
-                simple_set_mnt(mnt, sb);
+                kfree(super);
-                goto err0;
+                return dget(sb->s_root);
        }
-        super->s_devops = devops;
        /*
         * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
         * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
                goto err1;
        sb->s_flags |= MS_ACTIVE;
-        err = logfs_get_sb_final(sb, mnt);
+        err = logfs_get_sb_final(sb);
-        if (err)
+        if (err) {
                deactivate_locked_super(sb);
-        return err;
+                return ERR_PTR(err);
+        }
+        return dget(sb->s_root);
 err1:
        /* no ->s_root, no ->put_super() */
@@ -592,37 +587,45 @@ err1:
        iput(super->s_segfile_inode);
        iput(super->s_mapping_inode);
        deactivate_locked_super(sb);
-        return err;
+        return ERR_PTR(err);
-err0:
-        kfree(super);
-        //devops->put_device(sb);
-        return err;
 }
-static int logfs_get_sb(struct file_system_type *type, int flags,
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
-                const char *devname, void *data, struct vfsmount *mnt)
+                const char *devname, void *data)
 {
        ulong mtdnr;
+        struct logfs_super *super;
+        int err;
-        if (!devname)
+        super = kzalloc(sizeof(*super), GFP_KERNEL);
-                return logfs_get_sb_bdev(type, flags, devname, mnt);
+        if (!super)
-        if (strncmp(devname, "mtd", 3))
+                return ERR_PTR(-ENOMEM);
-                return logfs_get_sb_bdev(type, flags, devname, mnt);
-        {
+        if (!devname)
+                err = logfs_get_sb_bdev(super, type, devname);
+        else if (strncmp(devname, "mtd", 3))
+                err = logfs_get_sb_bdev(super, type, devname);
+        else {
                char *garbage;
                mtdnr = simple_strtoul(devname+3, &garbage, 0);
                if (*garbage)
-                        return -EINVAL;
+                        err = -EINVAL;
+                else
+                        err = logfs_get_sb_mtd(super, mtdnr);
+        }
+        if (err) {
+                kfree(super);
+                return ERR_PTR(err);
        }
-        return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+        return logfs_get_sb_device(super, type, flags);
 }
 static struct file_system_type logfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "logfs",
-        .get_sb         = logfs_get_sb,
+        .mount          = logfs_mount,
        .kill_sb        = logfs_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..fb2020858a34 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode)
                V2_minix_truncate(inode);
 }
-static int minix_get_sb(struct file_system_type *fs_type,
+static struct dentry *minix_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
-                           mnt);
 }
 static struct file_system_type minix_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "minix",
-        .get_sb         = minix_get_sb,
+        .mount          = minix_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..5362af9b7372 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
                struct dentry *base, struct nameidata *nd)
 {
+        struct inode *inode = base->d_inode;
        struct dentry *dentry;
-        struct inode *inode;
        int err;
-        inode = base->d_inode;
+        err = exec_permission(inode);
+        if (err)
+                return ERR_PTR(err);
        /*
         * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
 */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-        int err;
-        err = exec_permission(nd->path.dentry->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        if (err)
                return ERR_PTR(err);
-        err = exec_permission(base->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&this, base, NULL);
 }
@@ -1580,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
         */
        if (will_truncate)
                mnt_drop_write(nd->path.mnt);
+        path_put(&nd->path);
        return filp;
 exit:
@@ -1681,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                }
                filp = nameidata_to_filp(nd);
                mnt_drop_write(nd->path.mnt);
+                path_put(&nd->path);
                if (!IS_ERR(filp)) {
                        error = ima_file_check(filp, acc_mode);
                        if (error) {
@@ -2291,7 +2287,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                        goto slashes;
                inode = dentry->d_inode;
                if (inode)
-                        atomic_inc(&inode->i_count);
+                        ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7ca5182c0bed..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                goto out_free;
                }
-                mnt->mnt_flags = old->mnt_flags;
+                mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
                atomic_inc(&sb->s_active);
                mnt->mnt_sb = sb;
                mnt->mnt_root = dget(root);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 985fabb26aca..d290545aa0c4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -1020,16 +1020,16 @@ out:
        return result;
 }
-static int ncp_get_sb(struct file_system_type *fs_type,
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }
 static struct file_system_type ncp_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ncpfs",
-        .get_sb         = ncp_get_sb,
+        .mount          = ncp_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b950415d7c43..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,7 +1,6 @@
 config NFS_FS
        tristate "NFS client support"
        depends on INET && FILE_LOCKING
-        depends on BKL # fix as soon as lockd is done
        select LOCKD
        select SUNRPC
        select NFS_ACL_SUPPORT if NFS_V3_ACL
@@ -77,13 +76,17 @@ config NFS_V4
 config NFS_V4_1
        bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-        depends on NFS_V4 && EXPERIMENTAL
+        depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+        select PNFS_FILE_LAYOUT
        help
          This option enables support for minor version 1 of the NFSv4 protocol
-          (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+          (RFC 5661) in the kernel's NFS client.
          If unsure, say N.
+config PNFS_FILE_LAYOUT
+        tristate
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
@@ -118,3 +121,14 @@ config NFS_USE_KERNEL_DNS
        select DNS_RESOLVER
        select KEYS
        default y
+config NFS_USE_NEW_IDMAPPER
+        bool "Use the new idmapper upcall routine"
+        depends on NFS_V4 && KEYS
+        help
+          Say Y here if you want NFS to use the new idmapper upcall functions.
+          You will need /sbin/request-key (usually provided by the keyutils
+          package).  For details, read
+          <file:Documentation/filesystems/nfs/idmapper.txt>.
+          If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1)  += pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..aeec017fe814 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
 {
        int ret;
-        ret = svc_create_xprt(serv, "tcp", PF_INET,
+        ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
        if (ret <= 0)
                goto out_err;
@@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
        dprintk("NFS: Callback listener port = %u (af %u)\n",
                        nfs_callback_tcpport, PF_INET);
-        ret = svc_create_xprt(serv, "tcp", PF_INET6,
+        ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
        if (ret > 0) {
                nfs_callback_tcpport6 = ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
        if (delegation == NULL)
                return 0;
-        /* seqid is 4-bytes long */
+        if (stateid->stateid.seqid != 0)
-        if (((u32 *) &stateid->data)[0] != 0)
                return 0;
-        if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
+        if (memcmp(&delegation->stateid.stateid.other,
-                   sizeof(stateid->data)-4))
+                   &stateid->stateid.other,
+                   NFS4_STATEID_OTHER_SIZE))
                return 0;
        return 1;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..0870d0d4efc0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_CLIENT
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        cred = rpc_lookup_machine_cred();
        if (!IS_ERR(cred))
                clp->cl_machine_cred = cred;
+#if defined(CONFIG_NFS_V4_1)
+        INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
        nfs_fscache_get_client_cookie(clp);
        return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
                nfs_free_client(clp);
        }
 }
+EXPORT_SYMBOL_GPL(nfs_put_client);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
@@ -601,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 {
        struct rpc_clnt         *clnt = NULL;
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = clp->cl_proto,
                .address        = (struct sockaddr *)&clp->cl_addr,
                .addrsize       = clp->cl_addrlen,
@@ -635,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-        if (!(server->flags & NFS_MOUNT_NONLM))
+        if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
+                        !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
                nlmclnt_done(server->nlm_host);
 }
@@ -657,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
        if (nlm_init.nfs_version > 3)
                return 0;
-        if (server->flags & NFS_MOUNT_NONLM)
+        if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+                        (server->flags & NFS_MOUNT_LOCAL_FCNTL))
                return 0;
        switch (clp->cl_proto) {
@@ -898,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
                server->wsize = NFS_MAX_FILE_IO_SIZE;
        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        set_pnfs_layoutdriver(server, fsinfo->layouttype);
        server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
        server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-        if (server->dtsize > PAGE_CACHE_SIZE)
+        if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
-                server->dtsize = PAGE_CACHE_SIZE;
+                server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
        if (server->dtsize > server->rsize)
                server->dtsize = server->rsize;
@@ -913,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        server->maxfilesize = fsinfo->maxfilesize;
+        server->time_delta = fsinfo->time_delta;
        /* We're airborne Set socket buffersize */
        rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 }
@@ -935,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
        }
        fsinfo.fattr = fattr;
+        fsinfo.layouttype = 0;
        error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
        if (error < 0)
                goto out_error;
@@ -1017,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
 {
        dprintk("--> nfs_free_server()\n");
+        unset_pnfs_layoutdriver(server);
        spin_lock(&nfs_client_lock);
        list_del(&server->client_link);
        list_del(&server->master_link);
@@ -1356,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
        /* Initialise the client representation from the mount data */
        server->flags = data->flags;
-        server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+        server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
-                NFS_CAP_POSIX_LOCK;
+        if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+                        server->caps |= NFS_CAP_READDIRPLUS;
        server->options = data->options;
        /* Get a client record */
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..07ac3847e562 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
-#include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 /* #define NFS_DEBUG_VERBOSE 1 */
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
                      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static int nfs_readdir_clear_array(struct page*, gfp_t);
 const struct file_operations nfs_dir_operations = {
        .llseek         = nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
        .setattr        = nfs_setattr,
 };
+const struct address_space_operations nfs_dir_addr_space_ops = {
+        .releasepage = nfs_readdir_clear_array,
+};
 #ifdef CONFIG_NFS_V3
 const struct inode_operations nfs3_dir_inode_operations = {
        .create         = nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
 #ifdef CONFIG_NFS_V4
 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
 const struct inode_operations nfs4_dir_inode_operations = {
-        .create         = nfs_create,
+        .create         = nfs_open_create,
        .lookup         = nfs_atomic_lookup,
        .link           = nfs_link,
        .unlink         = nfs_unlink,
@@ -150,51 +157,197 @@ nfs_opendir(struct inode *inode, struct file *filp)
        return res;
 }
-typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+/* One cached directory entry, filled in by nfs_readdir_add_to_array(). */
+struct nfs_cache_array_entry {
+        /* copied from nfs_entry->prev_cookie */
+        u64 cookie;
+        /* copied from nfs_entry->ino */
+        u64 ino;
+        /* kmemdup()'ed name plus hash; the name is freed by nfs_readdir_clear_array() */
+        struct qstr string;
+};
+/* Header stored at the start of a readdir cache page, followed by the entries. */
+struct nfs_cache_array {
+        /* number of entries currently stored in array[] */
+        unsigned int size;
+        /* index of the entry flagged as end-of-directory; -1 until one is seen */
+        int eof_index;
+        /* nfs_entry->cookie of the most recently added entry */
+        u64 last_cookie;
+        /* at most MAX_READDIR_ARRAY entries fit in the rest of the page */
+        struct nfs_cache_array_entry array[0];
+};
+#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
        struct file     *file;
        struct page     *page;
        unsigned long   page_index;
-        __be32          *ptr;
        u64             *dir_cookie;
        loff_t          current_index;
-        struct nfs_entry *entry;
        decode_dirent_t decode;
-        int             plus;
        unsigned long   timestamp;
        unsigned long   gencount;
-        int             timestamp_valid;
+        unsigned int    cache_entry_index;
+        unsigned int    plus:1;
+        unsigned int    eof:1;
 } nfs_readdir_descriptor_t;
-/* Now we cache directories properly, by stuffing the dirent
+/*
- * data directly in the page cache.
+ * The caller is responsible for calling nfs_readdir_release_array(page)
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- *       page-in of the RPC reply, nowhere else, this simplies
- *       things substantially.
 */
 static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+        if (page == NULL)
+                return ERR_PTR(-EIO);
+        return (struct nfs_cache_array *)kmap(page);
+}
+/* Drop the kmap() of @page that nfs_readdir_get_array() took. */
+static
+void nfs_readdir_release_array(struct page *page)
+{
+        kunmap(page);
+}
+/*
+ * Free the name strings that nfs_readdir_add_to_array() allocated for
+ * every entry of the cache array stored in @page.
+ *
+ * @mask is not used. Always returns 0.
+ */
+static
+int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+{
+        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        unsigned int i;
+        /* nfs_readdir_get_array() reports a missing page as an ERR_PTR() */
+        if (IS_ERR(array))
+                return 0;
+        for (i = 0; i < array->size; i++)
+                kfree(array->array[i].string.name);
+        nfs_readdir_release_array(page);
+        return 0;
+}
+/*
+ * Fill in @string with a kmemdup()'ed copy of the @len bytes at @name and
+ * the full_name_hash() of that name.
+ * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ *
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_readdir_clear_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+        string->len = len;
+        string->name = kmemdup(name, len, GFP_KERNEL);
+        if (string->name == NULL)
+                return -ENOMEM;
+        string->hash = full_name_hash(name, len);
+        return 0;
+}
+/*
+ * Append @entry to the cache array stored in @page.
+ *
+ * Returns 0 on success, -EIO when the array already holds
+ * MAX_READDIR_ARRAY entries (or the page cannot be mapped), or the error
+ * from nfs_readdir_make_qstr(). The array is left untouched on failure,
+ * apart from the cookie/ino already written into the unused slot.
+ */
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        struct nfs_cache_array_entry *slot;
+        int err = -EIO;
+        if (IS_ERR(array))
+                return PTR_ERR(array);
+        if (array->size < MAX_READDIR_ARRAY) {
+                slot = &array->array[array->size];
+                slot->cookie = entry->prev_cookie;
+                slot->ino = entry->ino;
+                err = nfs_readdir_make_qstr(&slot->string, entry->name, entry->len);
+                if (err == 0) {
+                        /* only publish the slot once its name copy exists */
+                        array->last_cookie = entry->cookie;
+                        if (entry->eof == 1)
+                                array->eof_index = array->size;
+                        array->size++;
+                }
+        }
+        nfs_readdir_release_array(page);
+        return err;
+}
+/*
+ * Locate the entry for file position desc->file->f_pos in @array, where
+ * desc->current_index is the file position of the array's first entry.
+ *
+ * Returns 0 on a hit, after storing the entry's cookie in
+ * *desc->dir_cookie and its index in desc->cache_entry_index (desc->eof is
+ * set if that entry is the end-of-directory entry).
+ * Returns -EAGAIN, after advancing desc->current_index past this array,
+ * when the position lies beyond the array and no EOF entry is recorded.
+ * Returns -EBADCOOKIE with desc->eof set when the position precedes the
+ * array or lies beyond an array that holds the EOF entry.
+ */
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+        loff_t diff = desc->file->f_pos - desc->current_index;
+        unsigned int index;
+        if (diff < 0)
+                goto out_eof;
+        if (diff >= array->size) {
+                /* eof_index is -1 until EOF has been seen; 0 is a valid index */
+                if (array->eof_index >= 0)
+                        goto out_eof;
+                desc->current_index += array->size;
+                return -EAGAIN;
+        }
+        index = (unsigned int)diff;
+        *desc->dir_cookie = array->array[index].cookie;
+        desc->cache_entry_index = index;
+        if (index == array->eof_index)
+                desc->eof = 1;
+        return 0;
+out_eof:
+        desc->eof = 1;
+        return -EBADCOOKIE;
+}
+/*
+ * Scan @array for the entry whose cookie equals *desc->dir_cookie.
+ *
+ * Returns 0 with desc->cache_entry_index set on a match. Otherwise returns
+ * -EBADCOOKIE if the scan passed the end-of-directory entry (desc->eof is
+ * set in that case, also when the match is the EOF entry itself), or
+ * -EAGAIN if the array holds no EOF entry.
+ */
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+        int miss = -EAGAIN;
+        int pos = 0;
+        while (pos < array->size) {
+                if (pos == array->eof_index) {
+                        desc->eof = 1;
+                        miss = -EBADCOOKIE;
+                }
+                if (array->array[pos].cookie == *desc->dir_cookie) {
+                        desc->cache_entry_index = pos;
+                        return 0;
+                }
+                pos++;
+        }
+        return miss;
+}
+/*
+ * Search the cache array in desc->page for the readdir target: by file
+ * position when *desc->dir_cookie is 0, by cookie otherwise.
+ *
+ * Returns the result of the chosen search helper, -EBADCOOKIE if
+ * desc->dir_cookie is NULL, or the error from nfs_readdir_get_array().
+ */
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+        struct nfs_cache_array *array;
+        int status;
+        if (desc->dir_cookie == NULL)
+                return -EBADCOOKIE;
+        array = nfs_readdir_get_array(desc->page);
+        if (IS_ERR(array))
+                return PTR_ERR(array);
+        status = (*desc->dir_cookie == 0) ?
+                        nfs_readdir_search_for_pos(array, desc) :
+                        nfs_readdir_search_for_cookie(array, desc);
+        nfs_readdir_release_array(desc->page);
+        return status;
+}
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+                        struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
-        struct file     *file = desc->file;
-        struct inode    *inode = file->f_path.dentry->d_inode;
        struct rpc_cred *cred = nfs_file_cred(file);
        unsigned long   timestamp, gencount;
        int             error;
-        dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
-                        __func__, (long long)desc->entry->cookie,
-                        page->index);
 again:
        timestamp = jiffies;
        gencount = nfs_inc_attr_generation_counter();
-        error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+        error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
                                          NFS_SERVER(inode)->dtsize, desc->plus);
        if (error < 0) {
                /* We requested READDIRPLUS, but the server doesn't grok it */
@@ -208,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
        }
        desc->timestamp = timestamp;
        desc->gencount = gencount;
-        desc->timestamp_valid = 1;
+error:
-        SetPageUptodate(page);
+        return error;
-        /* Ensure consistent page alignment of the data.
-         * Note: assumes we have exclusive access to this mapping either
-         *       through inode->i_mutex or some other mechanism.
-         */
-        if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-                /* Should never happen */
-                nfs_zap_mapping(inode, inode->i_mapping);
-        }
-        unlock_page(page);
-        return 0;
- error:
-        unlock_page(page);
-        return -EIO;
 }
-static inline
+/* Fill in an entry based on the xdr code stored in desc->page */
-int dir_decode(nfs_readdir_descriptor_t *desc)
+static
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-        __be32  *p = desc->ptr;
+        __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
-        p = desc->decode(p, desc->entry, desc->plus);
        if (IS_ERR(p))
                return PTR_ERR(p);
-        desc->ptr = p;
-        if (desc->timestamp_valid) {
+        entry->fattr->time_start = desc->timestamp;
-                desc->entry->fattr->time_start = desc->timestamp;
+        entry->fattr->gencount = desc->gencount;
-                desc->entry->fattr->gencount = desc->gencount;
-        } else
-                desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
        return 0;
 }
-static inline
+/*
+ * Return 1 if @dentry has an inode whose NFS file handle is identical to
+ * the handle in @entry, 0 otherwise (including a negative dentry).
+ */
+static
-void dir_page_release(nfs_readdir_descriptor_t *desc)
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
-        kunmap(desc->page);
+        struct nfs_inode *node;
-        page_cache_release(desc->page);
+        if (dentry->d_inode == NULL)
-        desc->page = NULL;
+                goto different;
-        desc->ptr = NULL;
+        node = NFS_I(dentry->d_inode);
+        if (node->fh.size != entry->fh->size)
+                goto different;
+        /* handles are opaque bytes and may contain NULs: compare all of them */
+        if (memcmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+                goto different;
+        return 1;
+different:
+        return 0;
 }
-/*
+static
- * Given a pointer to a buffer that has already been filled by a call
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
- * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent(nfs_readdir_descriptor_t *desc)
 {
-        struct nfs_entry *entry = desc->entry;
+        struct qstr filename = {
-        int             loop_count = 0,
+                .len = entry->len,
-                        status;
+                .name = entry->name,
+        };
+        struct dentry *dentry;
+        struct dentry *alias;
+        struct inode *dir = parent->d_inode;
+        struct inode *inode;
-        while((status = dir_decode(desc)) == 0) {
+        if (filename.name[0] == '.') {
-                dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
+                if (filename.len == 1)
-                                __func__, (unsigned long long)entry->cookie);
+                        return;
-                if (entry->prev_cookie == *desc->dir_cookie)
+                if (filename.len == 2 && filename.name[1] == '.')
-                        break;
+                        return;
-                if (loop_count++ > 200) {
+        }
-                        loop_count = 0;
+        filename.hash = full_name_hash(filename.name, filename.len);
-                        schedule();
+        dentry = d_lookup(parent, &filename);
+        if (dentry != NULL) {
+                if (nfs_same_file(dentry, entry)) {
+                        nfs_refresh_inode(dentry->d_inode, entry->fattr);
+                        goto out;
+                } else {
+                        d_drop(dentry);
+                        dput(dentry);
                }
        }
-        return status;
+        dentry = d_alloc(parent, &filename);
+        if (dentry == NULL)
+                return;
+        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+        if (IS_ERR(inode))
+                goto out;
+        alias = d_materialise_unique(dentry, inode);
+        if (IS_ERR(alias))
+                goto out;
+        else if (alias) {
+                nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+                dput(alias);
+        } else
+                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+out:
+        dput(dentry);
+}
+/*
+ * Perform conversion from xdr to cache array.
+ *
+ * Decodes entries from the @buflen bytes at @xdr_page and appends each one
+ * to the cache array in @page. Stops when decoding fails, when the array
+ * refuses an entry, or once the entry flagged as EOF has been stored. With
+ * desc->plus set, every stored entry is also passed to nfs_prime_dcache().
+ * If decoding ends with -EBADCOOKIE and entry->eof set, the last stored
+ * entry is recorded as the end of the directory.
+ */
+static
+void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+                                void *xdr_page, struct page *page, unsigned int buflen)
+{
+        struct xdr_stream stream;
+        struct xdr_buf buf;
+        __be32 *ptr = xdr_page;
+        int status;
+        struct nfs_cache_array *array;
+        buf.head->iov_base = xdr_page;
+        buf.head->iov_len = buflen;
+        buf.tail->iov_len = 0;
+        buf.page_base = 0;
+        buf.page_len = 0;
+        buf.buflen = buf.head->iov_len;
+        buf.len = buf.head->iov_len;
+        xdr_init_decode(&stream, &buf, ptr);
+        do {
+                status = xdr_decode(desc, entry, &stream);
+                if (status != 0)
+                        break;
+                /*
+                 * nfs_readdir_add_to_array() fails with -EIO or -ENOMEM, so
+                 * stop on any non-zero result instead of testing for -1;
+                 * otherwise entries that were not stored get skipped.
+                 */
+                status = nfs_readdir_add_to_array(entry, page);
+                if (status != 0)
+                        break;
+                if (desc->plus == 1)
+                        nfs_prime_dcache(desc->file->f_path.dentry, entry);
+        } while (!entry->eof);
+        if (status == -EBADCOOKIE && entry->eof) {
+                array = nfs_readdir_get_array(page);
+                if (!IS_ERR(array)) {
+                        array->eof_index = array->size - 1;
+                        nfs_readdir_release_array(page);
+                }
+                status = 0;
+        }
+}
+/* Drop one reference on each of the first @npages pages of @pages. */
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+        struct page **end = pages + npages;
+        for (; pages != end; pages++)
+                put_page(*pages);
+}
+/*
+ * Undo nfs_readdir_large_page(): remove the vm_map_ram() mapping at @ptr,
+ * then release the @npages pages in @pages.
+ */
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+                unsigned int npages)
+{
+        vm_unmap_ram(ptr, npages);
+        nfs_readdir_free_pagearray(pages, npages);
 }
 /*
- * Given a pointer to a buffer that has already been filled by a call
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to readdir, find the entry at offset 'desc->file->f_pos'.
+ * to nfs_readdir_free_large_page
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
 */
-static inline
+static
-int find_dirent_index(nfs_readdir_descriptor_t *desc)
+void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-        struct nfs_entry *entry = desc->entry;
+        void *ptr;
-        int             loop_count = 0,
+        unsigned int i;
-                        status;
+        for (i = 0; i < npages; i++) {
+                struct page *page = alloc_page(GFP_KERNEL);
+                if (page == NULL)
+                        goto out_freepages;
+                pages[i] = page;
+        }
-        for(;;) {
+        ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-                status = dir_decode(desc);
+        if (!IS_ERR_OR_NULL(ptr))
-                if (status)
+                return ptr;
-                        break;
+out_freepages:
+        nfs_readdir_free_pagearray(pages, i);
+        return NULL;
+}
+/*
+ * Fill the cache array in @page with directory entries read from the
+ * server, starting at cookie *desc->dir_cookie.
+ *
+ * Issues readdir requests into a temporary multi-page buffer and converts
+ * each reply with nfs_readdir_page_filler() until the array is full or
+ * holds the end-of-directory entry.
+ *
+ * Returns -ENOMEM if the file handle, attributes or temporary buffer
+ * cannot be allocated; otherwise the status of the last
+ * nfs_readdir_xdr_filler() call.
+ */
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+        struct page *pages[NFS_MAX_READDIR_PAGES];
+        void *pages_ptr = NULL;
+        struct nfs_entry entry;
+        struct file     *file = desc->file;
+        struct nfs_cache_array *array;
+        /* only the allocation-failure paths below return this initial value */
+        int status = -ENOMEM;
+        unsigned int array_size = ARRAY_SIZE(pages);
+        entry.prev_cookie = 0;
+        entry.cookie = *desc->dir_cookie;
+        entry.eof = 0;
+        entry.fh = nfs_alloc_fhandle();
+        entry.fattr = nfs_alloc_fattr();
+        if (entry.fh == NULL || entry.fattr == NULL)
+                goto out;
-                dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
+        array = nfs_readdir_get_array(page);
-                                (unsigned long long)entry->cookie, desc->current_index);
+        memset(array, 0, sizeof(struct nfs_cache_array));
+        array->eof_index = -1;
-                if (desc->file->f_pos == desc->current_index) {
+        pages_ptr = nfs_readdir_large_page(pages, array_size);
-                        *desc->dir_cookie = entry->cookie;
+        if (!pages_ptr)
+                goto out_release_array;
+        do {
+                status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+                if (status < 0)
                        break;
-                }
+                nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
-                desc->current_index++;
+        } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
-                if (loop_count++ > 200) {
-                        loop_count = 0;
+        nfs_readdir_free_large_page(pages_ptr, pages, array_size);
-                        schedule();
+out_release_array:
-                }
+        nfs_readdir_release_array(page);
-        }
+out:
+        nfs_free_fattr(entry.fattr);
+        nfs_free_fhandle(entry.fh);
        return status;
 }
 /*
- * Find the given page, and call find_dirent() or find_dirent_index in
+ * Now we cache directories properly, by converting xdr information
- * order to try to return the next entry.
+ * to an array that can be used for lookups later.  This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
 */
-static inline
+static
-int find_dirent_page(nfs_readdir_descriptor_t *desc)
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
        struct inode    *inode = desc->file->f_path.dentry->d_inode;
-        struct page     *page;
-        int             status;
-        dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
+        if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
-                        __func__, desc->page_index,
+                goto error;
-                        (long long) *desc->dir_cookie);
+        SetPageUptodate(page);
-        /* If we find the page in the page_cache, we cannot be sure
+        if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-         * how fresh the data is, so we will ignore readdir_plus attributes.
+                /* Should never happen */
-         */
+                nfs_zap_mapping(inode, inode->i_mapping);
-        desc->timestamp_valid = 0;
-        page = read_cache_page(inode->i_mapping, desc->page_index,
-                               (filler_t *)nfs_readdir_filler, desc);
-        if (IS_ERR(page)) {
-                status = PTR_ERR(page);
-                goto out;
        }
+        unlock_page(page);
+        return 0;
+ error:
+        unlock_page(page);
+        return -EIO;
+}
-        /* NOTE: Someone else may have changed the READDIRPLUS flag */
+static
-        desc->page = page;
+void cache_page_release(nfs_readdir_descriptor_t *desc)
-        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
+{
-        if (*desc->dir_cookie != 0)
+        page_cache_release(desc->page);
-                status = find_dirent(desc);
+        desc->page = NULL;
-        else
+}
-                status = find_dirent_index(desc);
-        if (status < 0)
+static
-                dir_page_release(desc);
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
- out:
+{
-        dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
+        struct page *page;
-        return status;
+        page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+                        desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+        if (IS_ERR(page))
+                desc->eof = 1;
+        return page;
 }
 /*
- * Recurse through the page cache pages, and return a
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
- * filled nfs_entry structure of the next directory entry if possible.
- *
- * The target for the search is '*desc->dir_cookie' if non-0,
- * 'desc->file->f_pos' otherwise
 */
+/*
+ * Fetch cache page desc->page_index into desc->page and search it.
+ * On success (0) the page reference is kept in desc->page; on any search
+ * failure the page is released again and the search result is returned.
+ * A failure to get the page is returned as its error code.
+ */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+        int res;
+        desc->page = get_cache_page(desc);
+        if (IS_ERR(desc->page))
+                return PTR_ERR(desc->page);
+        res = nfs_readdir_search_array(desc);
+        if (res != 0)
+                cache_page_release(desc);
+        return res;
+}
+/* Search for desc->dir_cookie from the beginning of the page cache */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-        int             loop_count = 0;
+        int res = -EAGAIN;
-        int             res;
-        /* Always search-by-index from the beginning of the cache */
-        if (*desc->dir_cookie == 0) {
-                dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
-                                (long long)desc->file->f_pos);
-                desc->page_index = 0;
-                desc->entry->cookie = desc->entry->prev_cookie = 0;
-                desc->entry->eof = 0;
-                desc->current_index = 0;
-        } else
-                dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
-                                (unsigned long long)*desc->dir_cookie);
-        for (;;) {
+        while (1) {
-                res = find_dirent_page(desc);
+                res = find_cache_page(desc);
                if (res != -EAGAIN)
                        break;
-                /* Align to beginning of next page */
+                desc->page_index++;
-                desc->page_index ++;
-                if (loop_count++ > 200) {
-                        loop_count = 0;
-                        schedule();
-                }
        }
-        dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
        return res;
 }
@@ -400,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
        return (inode->i_mode >> 12) & 15;
 }
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
 /*
 * Once we've found the start of the dirent within a page: fill 'er up...
 */
@@ -410,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                   filldir_t filldir)
 {
        struct file     *file = desc->file;
-        struct nfs_entry *entry = desc->entry;
+        int i = 0;
-        struct dentry   *dentry = NULL;
+        int res = 0;
-        u64             fileid;
+        struct nfs_cache_array *array = NULL;
-        int             loop_count = 0,
+        unsigned int d_type = DT_UNKNOWN;
-                        res;
+        struct dentry *dentry = NULL;
-        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
-                        (unsigned long long)entry->cookie);
-        for(;;) {
-                unsigned d_type = DT_UNKNOWN;
-                /* Note: entry->prev_cookie contains the cookie for
-                 *       retrieving the current dirent on the server */
-                fileid = entry->ino;
-                /* Get a dentry if we have one */
-                if (dentry != NULL)
-                        dput(dentry);
-                dentry = nfs_readdir_lookup(desc);
-                /* Use readdirplus info */
+        array = nfs_readdir_get_array(desc->page);
-                if (dentry != NULL && dentry->d_inode != NULL) {
-                        d_type = dt_type(dentry->d_inode);
-                        fileid = NFS_FILEID(dentry->d_inode);
-                }
-                res = filldir(dirent, entry->name, entry->len, 
+        for (i = desc->cache_entry_index; i < array->size; i++) {
-                              file->f_pos, nfs_compat_user_ino64(fileid),
+                d_type = DT_UNKNOWN;
-                              d_type);
+                res = filldir(dirent, array->array[i].string.name,
+                        array->array[i].string.len, file->f_pos,
+                        nfs_compat_user_ino64(array->array[i].ino), d_type);
                if (res < 0)
                        break;
                file->f_pos++;
-                *desc->dir_cookie = entry->cookie;
+                desc->cache_entry_index = i;
-                if (dir_decode(desc) != 0) {
+                if (i < (array->size-1))
-                        desc->page_index ++;
+                        *desc->dir_cookie = array->array[i+1].cookie;
+                else
+                        *desc->dir_cookie = array->last_cookie;
+                if (i == array->eof_index) {
+                        desc->eof = 1;
                        break;
                }
-                if (loop_count++ > 200) {
-                        loop_count = 0;
-                        schedule();
-                }
        }
-        dir_page_release(desc);
+        nfs_readdir_release_array(desc->page);
+        cache_page_release(desc);
        if (dentry != NULL)
                dput(dentry);
        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -476,12 +716,9 @@ static inline
 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                     filldir_t filldir)
 {
-        struct file     *file = desc->file;
-        struct inode    *inode = file->f_path.dentry->d_inode;
-        struct rpc_cred *cred = nfs_file_cred(file);
        struct page     *page = NULL;
        int             status;
-        unsigned long   timestamp, gencount;
+        struct inode *inode = desc->file->f_path.dentry->d_inode;
        dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                        (unsigned long long)*desc->dir_cookie);
@@ -491,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                status = -ENOMEM;
                goto out;
        }
-        timestamp = jiffies;
-        gencount = nfs_inc_attr_generation_counter();
+        if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
-        status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
-                                                *desc->dir_cookie, page,
-                                                NFS_SERVER(inode)->dtsize,
-                                                desc->plus);
-        desc->page = page;
-        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
-        if (status >= 0) {
-                desc->timestamp = timestamp;
-                desc->gencount = gencount;
-                desc->timestamp_valid = 1;
-                if ((status = dir_decode(desc)) == 0)
-                        desc->entry->prev_cookie = *desc->dir_cookie;
-        } else
                status = -EIO;
-        if (status < 0)
                goto out_release;
+        }
+        desc->page_index = 0;
+        desc->page = page;
        status = nfs_do_filldir(desc, dirent, filldir);
-        /* Reset read descriptor so it searches the page cache from
-         * the start upon the next call to readdir_search_pagecache() */
-        desc->page_index = 0;
-        desc->entry->cookie = desc->entry->prev_cookie = 0;
-        desc->entry->eof = 0;
 out:
        dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
                        __func__, status);
        return status;
 out_release:
-        dir_page_release(desc);
+        cache_page_release(desc);
        goto out;
 }
@@ -536,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode    *inode = dentry->d_inode;
        nfs_readdir_descriptor_t my_desc,
                        *desc = &my_desc;
-        struct nfs_entry my_entry;
        int res = -ENOMEM;
        dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -557,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        desc->decode = NFS_PROTO(inode)->decode_dirent;
        desc->plus = NFS_USE_READDIRPLUS(inode);
-        my_entry.cookie = my_entry.prev_cookie = 0;
-        my_entry.eof = 0;
-        my_entry.fh = nfs_alloc_fhandle();
-        my_entry.fattr = nfs_alloc_fattr();
-        if (my_entry.fh == NULL || my_entry.fattr == NULL)
-                goto out_alloc_failed;
-        desc->entry = &my_entry;
        nfs_block_sillyrename(dentry);
        res = nfs_revalidate_mapping(inode, filp->f_mapping);
        if (res < 0)
                goto out;
-        while(!desc->entry->eof) {
+        while (desc->eof != 1) {
                res = readdir_search_pagecache(desc);
                if (res == -EBADCOOKIE) {
                        /* This means either end of directory */
-                        if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+                        if (*desc->dir_cookie && desc->eof == 0) {
                                /* Or that the server has 'lost' a cookie */
                                res = uncached_readdir(desc, dirent, filldir);
                                if (res >= 0)
@@ -588,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                if (res == -ETOOSMALL && desc->plus) {
                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        nfs_zap_caches(inode);
+                        desc->page_index = 0;
                        desc->plus = 0;
-                        desc->entry->eof = 0;
+                        desc->eof = 0;
                        continue;
                }
                if (res < 0)
@@ -605,9 +817,6 @@ out:
        nfs_unblock_sillyrename(dentry);
        if (res > 0)
                res = 0;
-out_alloc_failed:
-        nfs_free_fattr(my_entry.fattr);
-        nfs_free_fhandle(my_entry.fh);
        dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        res);
@@ -1029,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
        return 1;
 }
+/*
+ * Allocate an nfs_open_context for @dentry on the mount taken from
+ * nd->path.mnt, using the read/write/exec bits of the open intent flags
+ * as the mode and the credential returned by rpc_lookup_cred().
+ *
+ * The local credential reference is dropped once the context has been
+ * allocated. Returns the context, the credential lookup error, or
+ * ERR_PTR(-ENOMEM) if the context cannot be allocated.
+ */
+static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+{
+        struct path path = {
+                .mnt = nd->path.mnt,
+                .dentry = dentry,
+        };
+        struct nfs_open_context *ctx;
+        struct rpc_cred *cred;
+        fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+        cred = rpc_lookup_cred();
+        if (IS_ERR(cred))
+                return ERR_CAST(cred);
+        ctx = alloc_nfs_open_context(&path, cred, fmode);
+        put_rpccred(cred);
+        if (ctx == NULL)
+                return ERR_PTR(-ENOMEM);
+        return ctx;
+}
+/*
+ * Open callback handed to lookup_instantiate_filp(): sets the fscache
+ * inode cookie for @inode / @filp and always returns 0.
+ */
+static int do_open(struct inode *inode, struct file *filp)
+{
+        nfs_fscache_set_inode_cookie(inode, filp);
+        return 0;
+}
+/*
+ * Instantiate the struct file for the open intent in @nd and attach @ctx
+ * to it. For an execute intent, nfs_may_open() is consulted first and a
+ * negative result aborts the open.
+ *
+ * The caller's reference on @ctx is always dropped before returning.
+ * Returns the nfs_may_open() result (0 when it was not called) or the
+ * error from lookup_instantiate_filp().
+ */
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+        struct file *filp;
+        int ret = 0;
+        /* If the open_intent is for execute, we have an extra check to make */
+        if (ctx->mode & FMODE_EXEC)
+                ret = nfs_may_open(ctx->path.dentry->d_inode,
+                                ctx->cred,
+                                nd->intent.open.flags);
+        if (ret >= 0) {
+                filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+                if (IS_ERR(filp))
+                        ret = PTR_ERR(filp);
+                else
+                        nfs_file_set_open_context(filp, ctx);
+        }
+        put_nfs_open_context(ctx);
+        return ret;
+}
 static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
+        struct nfs_open_context *ctx;
+        struct iattr attr;
        struct dentry *res = NULL;
-        int error;
+        struct inode *inode;
+        int open_flags;
+        int err;
        dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1054,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                goto out;
        }
+        ctx = nameidata_to_nfs_open_context(dentry, nd);
+        res = ERR_CAST(ctx);
+        if (IS_ERR(ctx))
+                goto out;
+        open_flags = nd->intent.open.flags;
+        if (nd->flags & LOOKUP_CREATE) {
+                attr.ia_mode = nd->intent.open.create_mode;
+                attr.ia_valid = ATTR_MODE;
+                if (!IS_POSIXACL(dir))
+                        attr.ia_mode &= ~current_umask();
+        } else {
+                open_flags &= ~(O_EXCL | O_CREAT);
+                attr.ia_valid = 0;
+        }
        /* Open the file on the server */
-        res = nfs4_atomic_open(dir, dentry, nd);
+        nfs_block_sillyrename(dentry->d_parent);
-        if (IS_ERR(res)) {
+        inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
-                error = PTR_ERR(res);
+        if (IS_ERR(inode)) {
-                switch (error) {
+                nfs_unblock_sillyrename(dentry->d_parent);
+                put_nfs_open_context(ctx);
+                switch (PTR_ERR(inode)) {
                        /* Make a negative dentry */
                        case -ENOENT:
+                                d_add(dentry, NULL);
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
@@ -1072,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                        goto no_open;
                        /* case -EINVAL: */
                        default:
+                                res = ERR_CAST(inode);
                                goto out;
                }
-        } else if (res != NULL)
+        }
+        res = d_add_unique(dentry, inode);
+        nfs_unblock_sillyrename(dentry->d_parent);
+        if (res != NULL) {
+                dput(ctx->path.dentry);
+                ctx->path.dentry = dget(res);
                dentry = res;
+        }
+        err = nfs_intent_set_file(nd, ctx);
+        if (err < 0) {
+                if (res != NULL)
+                        dput(res);
+                return ERR_PTR(err);
+        }
 out:
+        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        return res;
 no_open:
        return nfs_lookup(dir, dentry, nd);
@@ -1087,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        struct dentry *parent = NULL;
        struct inode *inode = dentry->d_inode;
        struct inode *dir;
+        struct nfs_open_context *ctx;
        int openflags, ret = 0;
        if (!is_atomic_open(nd) || d_mountpoint(dentry))
                goto no_open;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
@@ -1112,99 +1410,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        /* We can't create new files, or truncate existing ones here */
        openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
+        ctx = nameidata_to_nfs_open_context(dentry, nd);
+        ret = PTR_ERR(ctx);
+        if (IS_ERR(ctx))
+                goto out;
        /*
         * Note: we're not holding inode->i_mutex and so may be racing with
         * operations that change the directory. We therefore save the
         * change attribute *before* we do the RPC call.
         */
-        ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+        inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+        if (IS_ERR(inode)) {
+                ret = PTR_ERR(inode);
+                switch (ret) {
+                case -EPERM:
+                case -EACCES:
+                case -EDQUOT:
+                case -ENOSPC:
+                case -EROFS:
+                        goto out_put_ctx;
+                default:
+                        goto out_drop;
+                }
+        }
+        iput(inode);
+        if (inode != dentry->d_inode)
+                goto out_drop;
+        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+        ret = nfs_intent_set_file(nd, ctx);
+        if (ret >= 0)
+                ret = 1;
 out:
        dput(parent);
-        if (!ret)
-                d_drop(dentry);
        return ret;
+out_drop:
+        d_drop(dentry);
+        ret = 0;
+out_put_ctx:
+        put_nfs_open_context(ctx);
+        goto out;
 no_open_dput:
        dput(parent);
 no_open:
        return nfs_lookup_revalidate(dentry, nd);
 }
-#endif /* CONFIG_NFSV4 */
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+                struct nameidata *nd)
 {
-        struct dentry *parent = desc->file->f_path.dentry;
+        struct nfs_open_context *ctx = NULL;
-        struct inode *dir = parent->d_inode;
+        struct iattr attr;
-        struct nfs_entry *entry = desc->entry;
+        int error;
-        struct dentry *dentry, *alias;
+        int open_flags = 0;
-        struct qstr name = {
-                .name = entry->name,
-                .len = entry->len,
-        };
-        struct inode *inode;
-        unsigned long verf = nfs_save_change_attribute(dir);
-        switch (name.len) {
+        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
-                case 2:
+                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
-                        if (name.name[0] == '.' && name.name[1] == '.')
-                                return dget_parent(parent);
-                        break;
-                case 1:
-                        if (name.name[0] == '.')
-                                return dget(parent);
-        }
-        spin_lock(&dir->i_lock);
+        attr.ia_mode = mode;
-        if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
+        attr.ia_valid = ATTR_MODE;
-                spin_unlock(&dir->i_lock);
-                return NULL;
-        }
-        spin_unlock(&dir->i_lock);
-        name.hash = full_name_hash(name.name, name.len);
+        if ((nd->flags & LOOKUP_CREATE) != 0) {
-        dentry = d_lookup(parent, &name);
+                open_flags = nd->intent.open.flags;
-        if (dentry != NULL) {
-                /* Is this a positive dentry that matches the readdir info? */
-                if (dentry->d_inode != NULL &&
-                                (NFS_FILEID(dentry->d_inode) == entry->ino ||
-                                d_mountpoint(dentry))) {
-                        if (!desc->plus || entry->fh->size == 0)
-                                return dentry;
-                        if (nfs_compare_fh(NFS_FH(dentry->d_inode),
-                                                entry->fh) == 0)
-                                goto out_renew;
-                }
-                /* No, so d_drop to allow one to be created */
-                d_drop(dentry);
-                dput(dentry);
-        }
-        if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
-                return NULL;
-        if (name.len > NFS_SERVER(dir)->namelen)
-                return NULL;
-        /* Note: caller is already holding the dir->i_mutex! */
-        dentry = d_alloc(parent, &name);
-        if (dentry == NULL)
-                return NULL;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
-        if (IS_ERR(inode)) {
-                dput(dentry);
-                return NULL;
-        }
-        alias = d_materialise_unique(dentry, inode);
+                ctx = nameidata_to_nfs_open_context(dentry, nd);
-        if (alias != NULL) {
+                error = PTR_ERR(ctx);
-                dput(dentry);
+                if (IS_ERR(ctx))
-                if (IS_ERR(alias))
+                        goto out_err_drop;
-                        return NULL;
-                dentry = alias;
        }
-out_renew:
+        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
-        nfs_set_verifier(dentry, verf);
+        if (error != 0)
-        return dentry;
+                goto out_put_ctx;
+        if (ctx != NULL) {
+                error = nfs_intent_set_file(nd, ctx);
+                if (error < 0)
+                        goto out_err;
+        }
+        return 0;
+out_put_ctx:
+        if (ctx != NULL)
+                put_nfs_open_context(ctx);
+out_err_drop:
+        d_drop(dentry);
+out_err:
+        return error;
 }
+#endif /* CONFIG_NFSV4 */
 /*
 * Code common to create, mkdir, and mknod.
 */
@@ -1258,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
        struct iattr attr;
        int error;
-        int open_flags = 0;
        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1266,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
        attr.ia_mode = mode;
        attr.ia_valid = ATTR_MODE;
-        if ((nd->flags & LOOKUP_CREATE) != 0)
+        error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
-                open_flags = nd->intent.open.flags;
-        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
        if (error != 0)
                goto out_err;
        return 0;
@@ -1351,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
        return error;
 }
-static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
-{
-        static unsigned int sillycounter;
-        const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
-        const int      countersize = sizeof(sillycounter)*2;
-        const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
-        char           silly[slen+1];
-        struct qstr    qsilly;
-        struct dentry *sdentry;
-        int            error = -EIO;
-        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
-                dentry->d_parent->d_name.name, dentry->d_name.name, 
-                atomic_read(&dentry->d_count));
-        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
-        /*
-         * We don't allow a dentry to be silly-renamed twice.
-         */
-        error = -EBUSY;
-        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
-                goto out;
-        sprintf(silly, ".nfs%*.*Lx",
-                fileidsize, fileidsize,
-                (unsigned long long)NFS_FILEID(dentry->d_inode));
-        /* Return delegation in anticipation of the rename */
-        nfs_inode_return_delegation(dentry->d_inode);
-        sdentry = NULL;
-        do {
-                char *suffix = silly + slen - countersize;
-                dput(sdentry);
-                sillycounter++;
-                sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
-                dfprintk(VFS, "NFS: trying to rename %s to %s\n",
-                                dentry->d_name.name, silly);
-                
-                sdentry = lookup_one_len(silly, dentry->d_parent, slen);
-                /*
-                 * N.B. Better to return EBUSY here ... it could be
-                 * dangerous to delete the file while it's in use.
-                 */
-                if (IS_ERR(sdentry))
-                        goto out;
-        } while(sdentry->d_inode != NULL); /* need negative lookup */
-        qsilly.name = silly;
-        qsilly.len  = strlen(silly);
-        if (dentry->d_inode) {
-                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-                                dir, &qsilly);
-                nfs_mark_for_revalidate(dentry->d_inode);
-        } else
-                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-                                dir, &qsilly);
-        if (!error) {
-                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                d_move(dentry, sdentry);
-                error = nfs_async_unlink(dir, dentry);
-                /* If we return 0 we don't unlink */
-        }
-        dput(sdentry);
-out:
-        return error;
-}
 /*
 * Remove a file after making sure there are no pending writes,
 * and after checking that the file has only one user. 
@@ -1580,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
        d_drop(dentry);
        error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
        if (error == 0) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_add(dentry, inode);
        }
        return error;
@@ -1711,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
 int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(head);
-        struct nfs_inode *nfsi;
+        struct nfs_inode *nfsi, *next;
        struct nfs_access_entry *cache;
        if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return (nr_to_scan == 0) ? 0 : -1;
        spin_lock(&nfs_access_lru_lock);
-        list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+        list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
                struct inode *inode;
                if (nr_to_scan-- == 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..84d3c8b90206 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
-        if (dreq->l_ctx != NULL)
+        if (dreq->l_ctx == NULL)
                goto out_release;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
                return 0;
        }
        item = container_of(h, struct nfs_dns_ent, h);
-        ttl = (long)item->h.expiry_time - (long)get_seconds();
+        ttl = item->h.expiry_time - seconds_since_boot();
        if (ttl < 0)
                ttl = 0;
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
        ttl = get_expiry(&buf);
        if (ttl == 0)
                goto out;
-        key.h.expiry_time = ttl + get_seconds();
+        key.h.expiry_time = ttl + seconds_since_boot();
        ret = -ENOMEM;
        item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
                goto out_err;
        ret = -ETIMEDOUT;
        if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-                        || (*item)->h.expiry_time < get_seconds()
+                        || (*item)->h.expiry_time < seconds_since_boot()
                        || cd->flush_time > (*item)->h.last_refresh)
                goto out_put;
        ret = -ENOENT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..e756075637b0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_FILE
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file->f_path.dentry->d_name.name,
                mapping->host->i_ino, len, (long long) pos);
+        pnfs_update_layout(mapping->host,
+                           nfs_file_open_context(file),
+                           IOMODE_RW);
 start:
        /*
         * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct file *filp = vma->vm_file;
        struct dentry *dentry = filp->f_path.dentry;
        unsigned pagelen;
-        int ret = -EINVAL;
+        int ret = VM_FAULT_NOPAGE;
        struct address_space *mapping;
        dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (mapping != dentry->d_inode->i_mapping)
                goto out_unlock;
-        ret = 0;
        pagelen = nfs_page_length(page);
        if (pagelen == 0)
                goto out_unlock;
-        ret = nfs_flush_incompatible(filp, page);
+        ret = VM_FAULT_LOCKED;
-        if (ret != 0)
+        if (nfs_flush_incompatible(filp, page) == 0 &&
-                goto out_unlock;
+            nfs_updatepage(filp, page, 0, pagelen) == 0)
+                goto out;
-        ret = nfs_updatepage(filp, page, 0, pagelen);
+        ret = VM_FAULT_SIGBUS;
 out_unlock:
-        if (!ret)
-                return VM_FAULT_LOCKED;
        unlock_page(page);
-        return VM_FAULT_SIGBUS;
+out:
+        return ret;
 }
 static const struct vm_operations_struct nfs_file_vm_ops = {
@@ -684,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
        return ret;
 }
-static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status = 0;
@@ -699,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
        if (nfs_have_delegation(inode, FMODE_READ))
                goto out_noconflict;
-        if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+        if (is_local)
                goto out_noconflict;
        status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
        return res;
 }
-static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status;
@@ -741,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
         *      If we're signalled while cleaning up locks on process exit, we
         *      still need to complete the unlock.
         */
-        /* Use local locking if mounted with "-onolock" */
+        /*
-        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+         * Use local locking if mounted with "-onolock" or with appropriate
+         * "-olocal_lock="
+         */
+        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
        return status;
 }
-static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
+/*
+ * Return 1 when the time delta *ts has a zero seconds part and at most
+ * 1000 nanoseconds, 0 otherwise.  do_setlk() uses this on the server's
+ * time_delta to choose between revalidating and zapping the caches.
+ */
+static int
+is_time_granular(struct timespec *ts) {
+        if (ts->tv_sec != 0)
+                return 0;
+        return ts->tv_nsec <= 1000;
+}
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status;
@@ -762,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
        if (status != 0)
                goto out;
-        /* Use local locking if mounted with "-onolock" */
+        /*
-        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+         * Use local locking if mounted with "-onolock" or with appropriate
+         * "-olocal_lock="
+         */
+        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
        if (status < 0)
                goto out;
        /*
-         * Make sure we clear the cache whenever we try to get the lock.
+         * Revalidate the cache if the server has time stamps granular
+         * enough to detect subsecond changes.  Otherwise, clear the
+         * cache to prevent missing any changes.
+         *
         * This makes locking act as a cache coherency point.
         */
        nfs_sync_mapping(filp->f_mapping);
-        if (!nfs_have_delegation(inode, FMODE_READ))
+        if (!nfs_have_delegation(inode, FMODE_READ)) {
-                nfs_zap_caches(inode);
+                if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+                        __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+                else
+                        nfs_zap_caches(inode);
+        }
 out:
        return status;
 }
@@ -787,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
        struct inode *inode = filp->f_mapping->host;
        int ret = -ENOLCK;
+        int is_local = 0;
        dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
                        filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
                goto out_err;
+        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+                is_local = 1;
        if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
                ret = NFS_PROTO(inode)->lock_check_bounds(fl);
                if (ret < 0)
@@ -807,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
        }
        if (IS_GETLK(cmd))
-                ret = do_getlk(filp, cmd, fl);
+                ret = do_getlk(filp, cmd, fl, is_local);
        else if (fl->fl_type == F_UNLCK)
-                ret = do_unlk(filp, cmd, fl);
+                ret = do_unlk(filp, cmd, fl, is_local);
        else
-                ret = do_setlk(filp, cmd, fl);
+                ret = do_setlk(filp, cmd, fl, is_local);
 out_err:
        return ret;
 }
@@ -821,6 +851,9 @@ out_err:
 */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
+        struct inode *inode = filp->f_mapping->host;
+        int is_local = 0;
        dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
                        filp->f_path.dentry->d_parent->d_name.name,
                        filp->f_path.dentry->d_name.name,
@@ -829,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
+        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+                is_local = 1;
        /* We're simulating flock() locks using posix locks on the server */
        fl->fl_owner = (fl_owner_t)filp;
        fl->fl_start = 0;
        fl->fl_end = OFFSET_MAX;
        if (fl->fl_type == F_UNLCK)
-                return do_unlk(filp, cmd, fl);
+                return do_unlk(filp, cmd, fl, is_local);
-        return do_setlk(filp, cmd, fl);
+        return do_setlk(filp, cmd, fl, is_local);
 }
 /*
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
                        iput(inode);
                        return -ENOMEM;
                }
-                /* Circumvent igrab(): we know the inode is not being freed */
+                ihold(inode);
-                atomic_inc(&inode->i_count);
                /*
                 * Ensure that this dentry is invisible to d_find_alias().
                 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..4e2d9b6b1380 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/nfs_idmap.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <keys/user-type.h>
+/*
+ * Size of the buffer a __u32 id is formatted into as decimal text
+ * (up to 10 digits plus the terminating NUL); also the largest id-string
+ * payload nfs_idmap_lookup_id() asks nfs_idmap_request_key() to copy.
+ */
+#define NFS_UINT_MAXLEN 11
+/*
+ * Credentials set up by nfs_idmap_init(); nfs_idmap_request_key()
+ * overrides the current creds with these around its request_key() call.
+ */
+const struct cred *id_resolver_cache;
+/* "id_resolver" key type; every operation reuses the user key type helper */
+struct key_type key_type_id_resolver = {
+        .name           = "id_resolver",
+        .instantiate    = user_instantiate,
+        .match          = user_match,
+        .revoke         = user_revoke,
+        .destroy        = user_destroy,
+        .describe       = user_describe,
+        .read           = user_read,
+};
+/*
+ * Set up the keyring-based id mapper.
+ *
+ * Prepares a kernel credential, allocates and instantiates the
+ * ".id_resolver" keyring, registers the id_resolver key type, installs the
+ * keyring as the credential's thread keyring and publishes the credential
+ * in id_resolver_cache.
+ *
+ * Returns 0 on success, -ENOMEM if no credential could be prepared, or the
+ * negative error from key_alloc(), key_instantiate_and_link() or
+ * register_key_type().  On failure the keyring and credential references
+ * taken so far are dropped.
+ */
+int nfs_idmap_init(void)
+{
+        struct cred *cred;
+        struct key *keyring;
+        int ret = 0;
+        printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+        cred = prepare_kernel_cred(NULL);
+        if (!cred)
+                return -ENOMEM;
+        keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
+                             (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                             KEY_USR_VIEW | KEY_USR_READ,
+                             KEY_ALLOC_NOT_IN_QUOTA);
+        if (IS_ERR(keyring)) {
+                ret = PTR_ERR(keyring);
+                goto failed_put_cred;
+        }
+        ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+        if (ret < 0)
+                goto failed_put_key;
+        ret = register_key_type(&key_type_id_resolver);
+        if (ret < 0)
+                goto failed_put_key;
+        /* Only publish the cred once the keyring and key type are both ready */
+        cred->thread_keyring = keyring;
+        cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+        id_resolver_cache = cred;
+        return 0;
+failed_put_key:
+        key_put(keyring);
+failed_put_cred:
+        put_cred(cred);
+        return ret;
+}
+/*
+ * Tear down what nfs_idmap_init() set up: revoke the resolver keyring,
+ * unregister the id_resolver key type and drop the reference on the
+ * credential held in id_resolver_cache.
+ */
+void nfs_idmap_quit(void)
+{
+        key_revoke(id_resolver_cache->thread_keyring);
+        unregister_key_type(&key_type_id_resolver);
+        put_cred(id_resolver_cache);
+}
+/*
+ * Assemble the description to pass to request_key(), in the form
+ * "<type>:<name>".
+ * This function will allocate a new string and update *desc to point
+ * at it.  The caller is responsible for freeing *desc with kfree().
+ *
+ * On allocation failure -ENOMEM is returned.  Otherwise, the size of the
+ * allocated buffer is returned: typelen + namelen + 2, which counts the
+ * ':' separator and the terminating NUL.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+                                const char *type, size_t typelen, char **desc)
+{
+        char *cp;
+        size_t desclen = typelen + namelen + 2;
+        *desc = kmalloc(desclen, GFP_KERNEL);
+        if (!*desc)
+                return -ENOMEM;
+        cp = *desc;
+        memcpy(cp, type, typelen);
+        cp += typelen;
+        *cp++ = ':';
+        memcpy(cp, name, namelen);
+        cp += namelen;
+        *cp = '\0';
+        return desclen;
+}
+/*
+ * Request the id_resolver key described by "<type>:<name>" and copy its
+ * payload into @data.
+ *
+ * @name/@namelen: name (or decimal id string) to resolve; need not be
+ *                 NUL-terminated, only @namelen bytes are used.
+ * @type:          NUL-terminated mapping type placed before the ':'.
+ * @data:          destination buffer for the raw payload bytes.
+ * @data_size:     capacity of @data.
+ *
+ * The request_key() call runs with id_resolver_cache as the active
+ * credentials.  The payload is copied verbatim; no NUL terminator is added.
+ *
+ * Returns the number of payload bytes copied (> 0), or a negative errno:
+ * the error from building the description, from request_key() or from
+ * key_validate(), or -EINVAL when the key has no payload or the payload is
+ * empty or larger than @data_size.
+ */
+static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
+                const char *type, void *data, size_t data_size)
+{
+        const struct cred *saved_cred;
+        struct key *rkey;
+        char *desc;
+        struct user_key_payload *payload;
+        ssize_t ret;
+        ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+        if (ret <= 0)
+                goto out;
+        saved_cred = override_creds(id_resolver_cache);
+        rkey = request_key(&key_type_id_resolver, desc, "");
+        revert_creds(saved_cred);
+        kfree(desc);
+        if (IS_ERR(rkey)) {
+                ret = PTR_ERR(rkey);
+                goto out;
+        }
+        rcu_read_lock();
+        rkey->perm |= KEY_USR_VIEW;
+        ret = key_validate(rkey);
+        if (ret < 0)
+                goto out_up;
+        payload = rcu_dereference(rkey->payload.data);
+        if (IS_ERR_OR_NULL(payload)) {
+                /*
+                 * PTR_ERR(NULL) is 0, which callers would read as success
+                 * with zero bytes copied; report a missing payload as an
+                 * error instead.
+                 */
+                ret = payload ? PTR_ERR(payload) : -EINVAL;
+                goto out_up;
+        }
+        ret = payload->datalen;
+        if (ret > 0 && ret <= data_size)
+                memcpy(data, payload->data, ret);
+        else
+                ret = -EINVAL;
+out_up:
+        rcu_read_unlock();
+        key_put(rkey);
+out:
+        return ret;
+}
+/*
+ * ID -> Name
+ *
+ * Formats @id as an unsigned decimal string and asks
+ * nfs_idmap_request_key() to resolve it for @type, storing the raw result
+ * in @buf (capacity @buflen).
+ *
+ * Returns the number of bytes stored in @buf, or -EINVAL for any failure
+ * reported by the key request.
+ */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+{
+        char digits[NFS_UINT_MAXLEN];
+        ssize_t copied;
+        int ndigits = snprintf(digits, sizeof(digits), "%u", id);
+        copied = nfs_idmap_request_key(digits, ndigits, type, buf, buflen);
+        /* Every lookup error is folded into -EINVAL for the callers */
+        return (copied < 0) ? -EINVAL : copied;
+}
+/*
+ * Name -> ID
+ *
+ * Resolves @name (@namelen bytes) for mapping @type through
+ * nfs_idmap_request_key() and parses the returned payload as a base-10
+ * number.
+ *
+ * Returns 0 and stores the id in *@id on success.  Returns -EINVAL when the
+ * key request fails or yields no data, or the error from strict_strtol()
+ * when the payload is not a valid number; *@id is left untouched on error.
+ */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen,
+                                const char *type, __u32 *id)
+{
+        /*
+         * The payload is copied without a terminator and may fill all
+         * NFS_UINT_MAXLEN bytes, so keep one spare byte for the NUL that
+         * strict_strtol() needs.
+         */
+        char id_str[NFS_UINT_MAXLEN + 1];
+        long id_long = 0;
+        ssize_t data_size;
+        int ret;
+        data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+        if (data_size <= 0)
+                return -EINVAL;
+        id_str[data_size] = '\0';
+        ret = strict_strtol(id_str, 10, &id_long);
+        if (ret == 0)
+                *id = (__u32)id_long;
+        return ret;
+}
+/*
+ * Map a user name (@namelen bytes at @name) to a uid via the "uid"
+ * resolver type.  @clp is not used by this implementation.
+ * Returns 0 with *@uid set, or a negative errno.
+ */
+int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+{
+        return nfs_idmap_lookup_id(name, namelen, "uid", uid);
+}
+/*
+ * Map a group name (@namelen bytes at @name) to a gid via the "gid"
+ * resolver type.  @clp is not used by this implementation.
+ * Returns 0 with *@gid set, or a negative errno.
+ */
+int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+{
+        return nfs_idmap_lookup_id(name, namelen, "gid", gid);
+}
+/*
+ * Map @uid to a user name via the "user" resolver type, writing the result
+ * into @buf (capacity @buflen).  @clp is not used by this implementation.
+ * Returns the number of bytes written to @buf, or -EINVAL on failure.
+ */
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+{
+        return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+}
+/*
+ * Map @gid to a group name via the "group" resolver type, writing the
+ * result into @buf (capacity @buflen).  @clp is not used by this
+ * implementation.
+ * Returns the number of bytes written to @buf, or -EINVAL on failure.
+ */
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+{
+        return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+}
+#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
        return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
        struct idmap *idmap = clp->cl_idmap;
        return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
        struct idmap *idmap = clp->cl_idmap;
        return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
 }
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
 #include "internal.h"
 #include "fscache.h"
 #include "dns_resolve.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_VFS
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
        return 0;
 }
-/* Don't use READDIRPLUS on directories that we believe are too large */
-#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
 /*
 * This is our front-end to iget that looks up inodes by file handle
 * instead of inode number.
@@ -291,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
-                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
+                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
-                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
                        if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -623,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
        nfs_revalidate_inode(server, inode);
 }
-static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
+struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
 {
        struct nfs_open_context *ctx;
@@ -633,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
                path_get(&ctx->path);
                ctx->cred = get_rpccred(cred);
                ctx->state = NULL;
+                ctx->mode = f_mode;
                ctx->flags = 0;
                ctx->error = 0;
                ctx->dir_cookie = 0;
                nfs_init_lock_context(&ctx->lock_context);
                ctx->lock_context.open_context = ctx;
+                INIT_LIST_HEAD(&ctx->list);
        }
        return ctx;
 }
@@ -653,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
        struct inode *inode = ctx->path.dentry->d_inode;
-        if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+        if (!list_empty(&ctx->list)) {
+                if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+                        return;
+                list_del(&ctx->list);
+                spin_unlock(&inode->i_lock);
+        } else if (!atomic_dec_and_test(&ctx->lock_context.count))
                return;
-        list_del(&ctx->list);
+        if (inode != NULL)
-        spin_unlock(&inode->i_lock);
+                NFS_PROTO(inode)->close_context(ctx, is_sync);
-        NFS_PROTO(inode)->close_context(ctx, is_sync);
        if (ctx->cred != NULL)
                put_rpccred(ctx->cred);
        path_put(&ctx->path);
@@ -673,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 * Ensure that mmap has a recent RPC credential for use when writing out
 * shared pages
 */
-static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
        cred = rpc_lookup_cred();
        if (IS_ERR(cred))
                return PTR_ERR(cred);
-        ctx = alloc_nfs_open_context(&filp->f_path, cred);
+        ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
        put_rpccred(cred);
        if (ctx == NULL)
                return -ENOMEM;
-        ctx->mode = filp->f_mode;
        nfs_file_set_open_context(filp, ctx);
        put_nfs_open_context(ctx);
        nfs_fscache_set_inode_cookie(inode, filp);
@@ -1409,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
 {
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
+        pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
@@ -1446,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
        nfsi->delegation = NULL;
        nfsi->delegation_state = 0;
        init_rwsem(&nfsi->rwsem);
+        nfsi->layout = NULL;
 #endif
 }
@@ -1493,7 +1497,7 @@ static int nfsiod_start(void)
 {
        struct workqueue_struct *wq;
        dprintk("RPC:       creating workqueue nfsiod\n");
-        wq = create_singlethread_workqueue("nfsiod");
+        wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
        if (wq == NULL)
                return -ENOMEM;
        nfsiod_workqueue = wq;
@@ -1521,6 +1525,10 @@ static int __init init_nfs_fs(void)
 {
        int err;
+        err = nfs_idmap_init();
+        if (err < 0)
+                goto out9;
        err = nfs_dns_resolver_init();
        if (err < 0)
                goto out8;
@@ -1585,6 +1593,8 @@ out6:
 out7:
        nfs_dns_resolver_destroy();
 out8:
+        nfs_idmap_quit();
+out9:
        return err;
 }
@@ -1597,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_nfspagecache();
        nfs_fscache_unregister();
        nfs_dns_resolver_destroy();
+        nfs_idmap_quit();
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..db08ff3ff454 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
 #define NFS_UNSPEC_PORT         (-1)
 /*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+/*
 * In-kernel mount arguments
 */
 struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..eceafe74f473 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
                .rpc_resp       = &result,
        };
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = info->protocol,
                .address        = info->sap,
                .addrsize       = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
                .to_retries = 2,
        };
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = IPPROTO_UDP,
                .address        = info->sap,
                .addrsize       = info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        for (i = 0; i < entries; i++) {
                flavors[i] = ntohl(*p++);
-                dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
+                dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
        }
        *count = i;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..e6bf45710cc7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
 static int
 nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        p = xdr_encode_fhandle(p, args->old_dir); /* old-directory file handle is encoded first */
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        p = xdr_encode_array(p, args->old_name->name, args->old_name->len); /* old_name is dereferenced for both the bytes and the length */
-        p = xdr_encode_fhandle(p, args->tofh);
+        p = xdr_encode_fhandle(p, args->new_dir); /* then the new-directory file handle */
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        p = xdr_encode_array(p, args->new_name->name, args->new_name->len); /* new_name is dereferenced the same way */
        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
        return 0;
 }
@@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct page **page;
        size_t hdrlen;
        unsigned int pglen, recvd;
-        u32 len;
        int status, nr = 0;
-        __be32 *end, *entry, *kaddr;
        if ((status = ntohl(*p++)))
                return nfs_stat_to_errno(status);
@@ -445,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        if (pglen > recvd)
                pglen = recvd;
        page = rcvbuf->pages;
-        kaddr = p = kmap_atomic(*page, KM_USER0);
-        end = (__be32 *)((char *)p + pglen);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                if (p + 2 > end)
-                        goto short_pkt;
-                p++; /* fileid */
-                len = ntohl(*p++);
-                p += XDR_QUADLEN(len) + 1;      /* name plus cookie */
-                if (len > NFS2_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-                                                len);
-                        goto err_unmap;
-                }
-                if (p + 2 > end)
-                        goto short_pkt;
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
- out:
-        kunmap_atomic(kaddr, KM_USER0);
        return nr;
- short_pkt:
+}
-        /*
-         * When we get a short packet there are 2 possibilities. We can
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* dprintk helper: reports func and the xdr->end - xdr->p difference, labelled as words */
-         * return an error, or fix up the response to look like a valid
+{
-         * response and return what we have so far. If there are no
+        dprintk("nfs: %s: prematurely hit end of receive buffer. "
-         * entries and the packet was short, then return -EIO. If there
+                "Remaining buffer length is %tu words.\n",
-         * are valid entries in the response, return them and pretend that
+                func, xdr->end - xdr->p);
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        entry[0] = entry[1] = 0;
-        if (!nr)
-                nr = -errno_NFSERR_IO;
-        goto out;
-err_unmap:
-        nr = -errno_NFSERR_IO;
-        goto out;
 }
 __be32 *
-nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
-        if (!*p++) {
+        __be32 *p;
-                if (!*p)
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 8);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->ino        = ntohl(*p++);
        entry->len        = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len + 4);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name       = (const char *) p;
        p                += XDR_QUADLEN(entry->len);
        entry->prev_cookie        = entry->cookie;
        entry->cookie     = ntohl(*p++);
-        entry->eof        = !p[0] && p[1];
+        p = xdr_inline_peek(xdr, 8);
+        if (p != NULL)
+                entry->eof = !p[0] && p[1];
+        else
+                entry->eof = 0;
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
 }
 /*
@@ -596,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct kvec *iov = rcvbuf->head;
        size_t hdrlen;
        u32 len, recvd;
-        char    *kaddr;
        int     status;
        if ((status = ntohl(*p++)))
@@ -623,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
                return -EIO;
        }
-        /* NULL terminate the string we got */
+        xdr_terminate_string(rcvbuf, len);
-        kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
 */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
        struct nfs3_createdata *data;
        mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) /* dir is not referenced in this body */
+{
+        msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; /* point the message at the NFSv3 RENAME procedure entry */
+}
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                      struct inode *new_dir) /* returns 0 when the jukebox handler reports non-zero, 1 after applying the reply attributes */
+{
+        struct nfs_renameres *res;
+        if (nfs3_async_handle_jukebox(task, old_dir))
+                return 0; /* attribute updates are skipped on this path */
+        res = task->tk_msg.rpc_resp;
+        nfs_post_op_update_inode(old_dir, res->old_fattr); /* old_fattr is applied to old_dir */
+        nfs_post_op_update_inode(new_dir, res->new_fattr); /* new_fattr is applied to new_dir */
+        return 1;
+}
 static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
                 struct inode *new_dir, struct qstr *new_name)
 {
-        struct nfs3_renameargs  arg = {
+        struct nfs_renameargs   arg = {
-                .fromfh         = NFS_FH(old_dir),
+                .old_dir        = NFS_FH(old_dir),
-                .fromname       = old_name->name,
+                .old_name       = old_name,
-                .fromlen        = old_name->len,
+                .new_dir        = NFS_FH(new_dir),
-                .tofh           = NFS_FH(new_dir),
+                .new_name       = new_name,
-                .toname         = new_name->name,
-                .tolen          = new_name->len
        };
-        struct nfs3_renameres res;
+        struct nfs_renameres res;
        struct rpc_message msg = {
                .rpc_proc       = &nfs3_procedures[NFS3PROC_RENAME],
                .rpc_argp       = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
        dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
-        res.fromattr = nfs_alloc_fattr();
+        res.old_fattr = nfs_alloc_fattr();
-        res.toattr = nfs_alloc_fattr();
+        res.new_fattr = nfs_alloc_fattr();
-        if (res.fromattr == NULL || res.toattr == NULL)
+        if (res.old_fattr == NULL || res.new_fattr == NULL)
                goto out;
        status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-        nfs_post_op_update_inode(old_dir, res.fromattr);
+        nfs_post_op_update_inode(old_dir, res.old_fattr);
-        nfs_post_op_update_inode(new_dir, res.toattr);
+        nfs_post_op_update_inode(new_dir, res.new_fattr);
 out:
-        nfs_free_fattr(res.toattr);
+        nfs_free_fattr(res.old_fattr);
-        nfs_free_fattr(res.fromattr);
+        nfs_free_fattr(res.new_fattr);
        dprintk("NFS reply rename: %d\n", status);
        return status;
 }
@@ -611,7 +630,7 @@ out:
 */
 static int
 nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                  u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        __be32                  *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
                .verf           = {verf[0], verf[1]},
                .plus           = plus,
                .count          = count,
-                .pages          = &page
+                .pages          = pages
        };
        struct nfs3_readdirres  res = {
                .verf           = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        nfs_free_fattr(res.dir_attr);
 out:
-        dprintk("NFS reply readdir: %d\n", status);
+        dprintk("NFS reply readdir%s: %d\n",
+                        plus? "plus" : "", status);
        return status;
 }
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
        dprintk("NFS call  fsstat\n");
        nfs_fattr_init(stat->fattr);
        status = rpc_call_sync(server->client, &msg, 0);
-        dprintk("NFS reply statfs: %d\n", status);
+        dprintk("NFS reply fsstat: %d\n", status);
        return status;
 }
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
        .unlink_setup   = nfs3_proc_unlink_setup,
        .unlink_done    = nfs3_proc_unlink_done,
        .rename         = nfs3_proc_rename,
+        .rename_setup   = nfs3_proc_rename_setup,
+        .rename_done    = nfs3_proc_rename_done,
        .link           = nfs3_proc_link,
        .symlink        = nfs3_proc_symlink,
        .mkdir          = nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..d9a5e832c257 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
        [NF3FIFO] = S_IFIFO,
 };
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* dprintk helper: reports func and the xdr->end - xdr->p difference, labelled as words */
+{
+        dprintk("nfs: %s: prematurely hit end of receive buffer. "
+                "Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
 /*
 * Common NFS XDR functions as inlines
 */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
        return NULL;
 }
+static inline __be32 *
+xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) /* returns the pointer past the handle, NULL if the size exceeds NFS3_FHSIZE, or ERR_PTR(-EIO) on overflow */
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4); /* 4-byte handle length */
+        if (unlikely(!p))
+                goto out_overflow;
+        fh->size = ntohl(*p++);
+        if (fh->size <= NFS3_FHSIZE) { /* larger sizes fall through to the NULL return without copying */
+                p = xdr_inline_decode(xdr, fh->size);
+                if (unlikely(!p))
+                        goto out_overflow;
+                memcpy(fh->data, p, fh->size);
+                return p + XDR_QUADLEN(fh->size);
+        }
+        return NULL; /* distinct from the ERR_PTR overflow result; fh->size has already been overwritten */
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
+}
 /*
 * Encode/decode time.
 */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
 }
 static inline __be32 *
+xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) /* returns a pointer into the decoded data, or ERR_PTR(-EIO) on overflow */
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4); /* 4-byte flag word: non-zero means attributes follow */
+        if (unlikely(!p))
+                goto out_overflow;
+        if (ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 84); /* 84 bytes = 21 XDR words, the same span the removed readdir scan skipped with "p += 21"; presumably the fixed attribute size -- confirm */
+                if (unlikely(!p))
+                        goto out_overflow;
+                p = xdr_decode_fattr(p, fattr);
+        }
+        return p; /* with no attributes present, p points just past the flag word and fattr is untouched */
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
+}
+static inline __be32 *
 xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
        if (*p++)
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
 * Encode RENAME arguments
 */
 static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) /* argument type changes from nfs3_renameargs to nfs_renameargs */
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        p = xdr_encode_fhandle(p, args->old_dir); /* old-directory file handle is encoded first */
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        p = xdr_encode_array(p, args->old_name->name, args->old_name->len); /* old_name is dereferenced for both the bytes and the length */
-        p = xdr_encode_fhandle(p, args->tofh);
+        p = xdr_encode_fhandle(p, args->new_dir); /* then the new-directory file handle */
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        p = xdr_encode_array(p, args->new_name->name, args->new_name->len); /* new_name is dereferenced the same way */
        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
        return 0;
 }
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        struct kvec *iov = rcvbuf->head;
        struct page **page;
        size_t hdrlen;
-        u32 len, recvd, pglen;
+        u32 recvd, pglen;
        int status, nr = 0;
-        __be32 *entry, *end, *kaddr;
        status = ntohl(*p++);
        /* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        if (pglen > recvd)
                pglen = recvd;
        page = rcvbuf->pages;
-        kaddr = p = kmap_atomic(*page, KM_USER0);
-        end = (__be32 *)((char *)p + pglen);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                if (p + 3 > end)
-                        goto short_pkt;
-                p += 2;                         /* inode # */
-                len = ntohl(*p++);              /* string length */
-                p += XDR_QUADLEN(len) + 2;      /* name + cookie */
-                if (len > NFS3_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-                                                len);
-                        goto err_unmap;
-                }
-                if (res->plus) {
-                        /* post_op_attr */
-                        if (p + 2 > end)
-                                goto short_pkt;
-                        if (*p++) {
-                                p += 21;
-                                if (p + 1 > end)
-                                        goto short_pkt;
-                        }
-                        /* post_op_fh3 */
-                        if (*p++) {
-                                if (p + 1 > end)
-                                        goto short_pkt;
-                                len = ntohl(*p++);
-                                if (len > NFS3_FHSIZE) {
-                                        dprintk("NFS: giant filehandle in "
-                                                "readdir (len 0x%x)!\n", len);
-                                        goto err_unmap;
-                                }
-                                p += XDR_QUADLEN(len);
-                        }
-                }
-                if (p + 2 > end)
-                        goto short_pkt;
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
- out:
-        kunmap_atomic(kaddr, KM_USER0);
        return nr;
- short_pkt:
-        /*
-         * When we get a short packet there are 2 possibilities. We can
-         * return an error, or fix up the response to look like a valid
-         * response and return what we have so far. If there are no
-         * entries and the packet was short, then return -EIO. If there
-         * are valid entries in the response, return them and pretend that
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        entry[0] = entry[1] = 0;
-        if (!nr)
-                nr = -errno_NFSERR_IO;
-        goto out;
-err_unmap:
-        nr = -errno_NFSERR_IO;
-        goto out;
 }
 __be32 *
-nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
+        __be32 *p;
        struct nfs_entry old = *entry;
-        if (!*p++) {
+        p = xdr_inline_decode(xdr, 4);
-                if (!*p)
+        if (unlikely(!p))
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 12);
+        if (unlikely(!p))
+                goto out_overflow;
        p = xdr_decode_hyper(p, &entry->ino);
        entry->len  = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len + 8);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name = (const char *) p;
        p += XDR_QUADLEN(entry->len);
        entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
        if (plus) {
                entry->fattr->valid = 0;
-                p = xdr_decode_post_op_attr(p, entry->fattr);
+                p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
+                if (IS_ERR(p))
+                        goto out_overflow_exit;
                /* In fact, a post_op_fh3: */
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
                if (*p++) {
-                        p = xdr_decode_fhandle(p, entry->fh);
+                        p = xdr_decode_fhandle_stream(xdr, entry->fh);
+                        if (IS_ERR(p))
+                                goto out_overflow_exit;
                        /* Ugh -- server reply was truncated */
                        if (p == NULL) {
                                dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
                        memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
        }
-        entry->eof = !p[0] && p[1];
+        p = xdr_inline_peek(xdr, 8);
+        if (p != NULL)
+                entry->eof = !p[0] && p[1];
+        else
+                entry->eof = 0;
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+out_overflow_exit:
+        return ERR_PTR(-EIO);
 }
 /*
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        struct kvec *iov = rcvbuf->head;
        size_t hdrlen;
        u32 len, recvd;
-        char    *kaddr;
        int     status;
        status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
                return -EIO;
        }
-        /* NULL terminate the string we got */
+        xdr_terminate_string(rcvbuf, len);
-        kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 }
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 * Decode RENAME reply
 */
 static int
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) /* result type changes from nfs3_renameres to nfs_renameres */
 {
        int     status;
        if ((status = ntohl(*p++)) != 0)
                status = nfs_stat_to_errno(status);
-        p = xdr_decode_wcc_data(p, res->fromattr);
+        p = xdr_decode_wcc_data(p, res->old_fattr); /* first wcc_data block goes to old_fattr; decoded even when status is non-zero */
-        p = xdr_decode_wcc_data(p, res->toattr);
+        p = xdr_decode_wcc_data(p, res->new_fattr); /* second wcc_data block goes to new_fattr */
        return status;
 }
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
        res->wtmult = ntohl(*p++);
        res->dtpref = ntohl(*p++);
        p = xdr_decode_hyper(p, &res->maxfilesize);
+        p = xdr_decode_time3(p, &res->time_delta);
-        /* ignore time_delta and properties */
+        /* ignore properties */
        res->lease_time = 0;
        return 0;
 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..9fa496387fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,8 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
@@ -333,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..2e92f0d8d654
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs4filelayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+/*
+ * Layout driver "set" hook: create the deviceid cache for this server's
+ * nfs_client, handing it nfs4_fl_free_deviceid_callback.
+ * Returns 0 on success, otherwise the status reported by
+ * pnfs_alloc_init_deviceid_cache().
+ */
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
+{
+        int err;
+        err = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+                                             nfs4_fl_free_deviceid_callback);
+        if (!err) {
+                dprintk("%s: deviceid cache has been initialized successfully\n",
+                        __func__);
+                return 0;
+        }
+        printk(KERN_WARNING "%s: deviceid cache could not be "
+                "initialized\n", __func__);
+        return err;
+}
+/*
+ * Layout driver "clear" hook: drop this server's use of the client's
+ * deviceid cache, if the client has one. Always returns 0.
+ */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
+{
+        struct nfs_client *clp;
+        dprintk("--> %s\n", __func__);
+        clp = nfss->nfs_client;
+        if (clp->cl_devid_cache != NULL)
+                pnfs_put_deviceid_cache(clp);
+        return 0;
+}
+/*
+ * filelayout_check_layout()
+ *
+ * Sanity-check the decoded layout segment parameters against the device
+ * they name. At this point no generic layer initialization of the lseg
+ * has occurred, and nothing has been added to the layout_hdr cache.
+ *
+ * The device is looked up in the client's deviceid cache and fetched with
+ * get_device_info() on a miss; it is stored in fl->dsaddr. If a later
+ * check fails the device is released again with pnfs_put_deviceid().
+ *
+ * Returns 0 when the layout is acceptable, -EINVAL otherwise.
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+                        struct nfs4_filelayout_segment *fl,
+                        struct nfs4_layoutget_res *lgr,
+                        struct nfs4_deviceid *id)
+{
+        struct nfs_server *server = NFS_SERVER(lo->inode);
+        struct nfs4_file_layout_dsaddr *dev;
+        int sparse_bad, dense_bad;
+        int rc = -EINVAL;
+        dprintk("--> %s\n", __func__);
+        if (fl->pattern_offset > lgr->range.offset) {
+                dprintk("%s pattern_offset %lld to large\n",
+                                __func__, fl->pattern_offset);
+                goto out;
+        }
+        if (fl->stripe_unit % PAGE_SIZE) {
+                dprintk("%s Stripe unit (%u) not page aligned\n",
+                        __func__, fl->stripe_unit);
+                goto out;
+        }
+        /* take the device from the cache, or fetch it on a miss */
+        dev = nfs4_fl_find_get_deviceid(server->nfs_client, id);
+        if (!dev)
+                dev = get_device_info(lo->inode, id);
+        if (!dev)
+                goto out;
+        fl->dsaddr = dev;
+        if (fl->first_stripe_index < 0 ||
+            fl->first_stripe_index >= dev->stripe_count) {
+                dprintk("%s Bad first_stripe_index %d\n",
+                                __func__, fl->first_stripe_index);
+                goto out_put;
+        }
+        /* the handle count must match the packing the server selected */
+        sparse_bad = fl->stripe_type == STRIPE_SPARSE &&
+                     fl->num_fh > 1 && fl->num_fh != dev->ds_num;
+        dense_bad = fl->stripe_type == STRIPE_DENSE &&
+                    fl->num_fh != dev->stripe_count;
+        if (sparse_bad || dense_bad) {
+                dprintk("%s num_fh %u not valid for given packing\n",
+                        __func__, fl->num_fh);
+                goto out_put;
+        }
+        /* a stripe unit unaligned with rsize/wsize is reported, not rejected */
+        if (fl->stripe_unit % server->rsize || fl->stripe_unit % server->wsize) {
+                dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+                        "wsize %u\n", __func__, fl->stripe_unit, server->rsize,
+                        server->wsize);
+        }
+        rc = 0;
+        goto out;
+out_put:
+        pnfs_put_deviceid(server->nfs_client->cl_devid_cache, &dev->deviceid);
+out:
+        dprintk("--> %s returns %d\n", __func__, rc);
+        return rc;
+}
+/*
+ * Free every file handle held in fl->fh_array, then the array itself, and
+ * reset fl->fh_array to NULL.
+ *
+ * Must tolerate fh_array == NULL while num_fh is still non-zero:
+ * filelayout_decode_layout() returns with a NULL array when the array
+ * allocation fails or after it has already called this function on an
+ * error path, and filelayout_alloc_lseg() then frees the segment through
+ * _filelayout_free_lseg(), which calls this function again.
+ */
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+        int i;
+        if (!fl->fh_array)
+                return;
+        for (i = 0; i < fl->num_fh; i++) {
+                /* slots are filled in order; the first NULL ends them */
+                if (!fl->fh_array[i])
+                        break;
+                kfree(fl->fh_array[i]);
+        }
+        kfree(fl->fh_array);
+        fl->fh_array = NULL;
+}
+/*
+ * Free a segment's file-handle array and then the segment itself.
+ * The device held in fl->dsaddr is not released here.
+ */
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+        filelayout_free_fh_array(fl);
+        kfree(fl);
+}
+/*
+ * Decode the opaque file layout found in lgr->layout.buf.
+ *
+ * Reads, in order: the device id (copied into @id), nfl_util (flag bits
+ * plus stripe unit), first_stripe_index, pattern_offset and the list of
+ * file handles, filling in @fl as it goes. @flo is not used.
+ *
+ * Returns 0 on success, -ENOMEM if the handle array or one of the handles
+ * cannot be allocated, or -EIO if a handle is longer than the data[]
+ * buffer of struct nfs_fh. On the error paths taken after the array has
+ * been allocated it is released with filelayout_free_fh_array().
+ *
+ * NOTE(review): the reads are not bounded by the length of the layout
+ * buffer here; presumably that was validated earlier - confirm.
+ */
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+                         struct nfs4_filelayout_segment *fl,
+                         struct nfs4_layoutget_res *lgr,
+                         struct nfs4_deviceid *id)
+{
+        uint32_t *p = (uint32_t *)lgr->layout.buf;
+        uint32_t nfl_util;
+        uint32_t fh_len;
+        int i;
+        dprintk("%s: set_layout_map Begin\n", __func__);
+        memcpy(id, p, sizeof(*id));
+        p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+        print_deviceid(id);
+        nfl_util = be32_to_cpup(p++);
+        if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+                fl->commit_through_mds = 1;
+        if (nfl_util & NFL4_UFLG_DENSE)
+                fl->stripe_type = STRIPE_DENSE;
+        else
+                fl->stripe_type = STRIPE_SPARSE;
+        /* whatever is left after masking off the flag bits is the stripe unit */
+        fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+        fl->first_stripe_index = be32_to_cpup(p++);
+        p = xdr_decode_hyper(p, &fl->pattern_offset);
+        fl->num_fh = be32_to_cpup(p++);
+        dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+                __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+                fl->pattern_offset);
+        /* kcalloc() fails cleanly if num_fh * sizeof(pointer) would overflow */
+        fl->fh_array = kcalloc(fl->num_fh, sizeof(struct nfs_fh *),
+                               GFP_KERNEL);
+        if (!fl->fh_array)
+                return -ENOMEM;
+        for (i = 0; i < fl->num_fh; i++) {
+                /* Do we want to use a mempool here? */
+                fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+                if (!fl->fh_array[i]) {
+                        filelayout_free_fh_array(fl);
+                        return -ENOMEM;
+                }
+                /*
+                 * Check the full 32-bit wire length against the data[]
+                 * buffer before narrowing it into ->size. Comparing with
+                 * sizeof(struct nfs_fh) would also count the size member
+                 * and let the memcpy() below run past the end of data[].
+                 */
+                fh_len = be32_to_cpup(p++);
+                if (fh_len > sizeof(fl->fh_array[i]->data)) {
+                        printk(KERN_ERR "Too big fh %d received %d\n",
+                               i, fh_len);
+                        filelayout_free_fh_array(fl);
+                        return -EIO;
+                }
+                fl->fh_array[i]->size = fh_len;
+                memcpy(fl->fh_array[i]->data, p, fh_len);
+                p += XDR_QUADLEN(fh_len);
+                dprintk("DEBUG: %s: fh len %d\n", __func__,
+                        fl->fh_array[i]->size);
+        }
+        return 0;
+}
+/*
+ * Layout driver alloc_lseg hook: allocate a file layout segment, decode
+ * the LAYOUTGET result into it and validate it against its device.
+ * Returns the embedded generic segment header, or NULL if allocation,
+ * decoding or validation fails.
+ */
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+                      struct nfs4_layoutget_res *lgr)
+{
+        struct nfs4_deviceid devid;
+        struct nfs4_filelayout_segment *seg;
+        dprintk("--> %s\n", __func__);
+        seg = kzalloc(sizeof(*seg), GFP_KERNEL);
+        if (seg == NULL)
+                return NULL;
+        if (filelayout_decode_layout(layoutid, seg, lgr, &devid) != 0)
+                goto out_free;
+        /* only reached when decoding succeeded, so devid is filled in */
+        if (filelayout_check_layout(layoutid, seg, lgr, &devid))
+                goto out_free;
+        return &seg->generic_hdr;
+out_free:
+        _filelayout_free_lseg(seg);
+        return NULL;
+}
+/*
+ * Layout driver free_lseg hook: release the segment's device through the
+ * client's deviceid cache, then free the segment and its file handles.
+ */
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+        struct nfs4_filelayout_segment *seg = FILELAYOUT_LSEG(lseg);
+        struct nfs_client *clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+        dprintk("--> %s\n", __func__);
+        pnfs_put_deviceid(clp->cl_devid_cache, &seg->dsaddr->deviceid);
+        _filelayout_free_lseg(seg);
+}
+/*
+ * Layout driver description for LAYOUT_NFSV4_1_FILES; passed to
+ * pnfs_register_layoutdriver() at module init and to
+ * pnfs_unregister_layoutdriver() at module exit.
+ */
+static struct pnfs_layoutdriver_type filelayout_type = {
+        .id = LAYOUT_NFSV4_1_FILES,
+        .name = "LAYOUT_NFSV4_1_FILES",
+        .owner = THIS_MODULE,
+        .set_layoutdriver = filelayout_set_layoutdriver,
+        .clear_layoutdriver = filelayout_clear_layoutdriver,
+        .alloc_lseg              = filelayout_alloc_lseg,
+        .free_lseg               = filelayout_free_lseg,
+};
+/*
+ * Module init: log a notice and register filelayout_type.
+ * Returns whatever pnfs_register_layoutdriver() returns.
+ */
+static int __init nfs4filelayout_init(void)
+{
+        printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+               __func__);
+        return pnfs_register_layoutdriver(&filelayout_type);
+}
+/* Module exit: log a notice and unregister filelayout_type. */
+static void __exit nfs4filelayout_exit(void)
+{
+        printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+               __func__);
+        pnfs_unregister_layoutdriver(&filelayout_type);
+}
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+#include "pnfs.h"
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+/*
+ * Stripe packing of a layout segment. The layout decoder stores
+ * STRIPE_DENSE when NFL4_UFLG_DENSE is set in nfl_util and STRIPE_SPARSE
+ * otherwise.
+ */
+enum stripetype4 {
+        STRIPE_SPARSE = 1,
+        STRIPE_DENSE = 2
+};
+/* Individual ip address */
+struct nfs4_pnfs_ds {
+        struct list_head        ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+        u32                     ds_ip_addr;     /* IPv4 address as written by in4_pton() */
+        u32                     ds_port;        /* port, stored as htons() result */
+        struct nfs_client       *ds_clp;        /* NULL until set; put in destroy_ds() */
+        atomic_t                ds_count;       /* references from decoded devices */
+};
+/*
+ * Decoded device address: a stripe index table plus the data servers it
+ * refers to. Allocated by decode_device() with room for ds_num entries in
+ * ds_list (the declared single element plus ds_num - 1 more).
+ */
+struct nfs4_file_layout_dsaddr {
+        struct pnfs_deviceid_node       deviceid;       /* entry in the deviceid cache */
+        u32                             stripe_count;   /* entries in stripe_indices */
+        u8                              *stripe_indices; /* each value is < ds_num */
+        u32                             ds_num;         /* entries in ds_list */
+        struct nfs4_pnfs_ds             *ds_list[1];    /* really ds_num entries */
+};
+/*
+ * File layout segment: the generic pNFS segment header followed by the
+ * fields decoded from the LAYOUTGET reply.
+ */
+struct nfs4_filelayout_segment {
+        struct pnfs_layout_segment generic_hdr; /* see FILELAYOUT_LSEG() */
+        u32 stripe_type;        /* enum stripetype4 */
+        u32 commit_through_mds; /* 1 if NFL4_UFLG_COMMIT_THRU_MDS was set */
+        u32 stripe_unit;        /* nfl_util with the flag bits masked off */
+        u32 first_stripe_index;
+        u64 pattern_offset;
+        struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+        unsigned int num_fh;    /* number of entries in fh_array */
+        struct nfs_fh **fh_array;
+};
+/*
+ * Map a generic layout segment back to the file layout segment that
+ * embeds it as generic_hdr.
+ */
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+        return container_of(lseg,
+                            struct nfs4_filelayout_segment,
+                            generic_hdr);
+}
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+#include "nfs4filelayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+/* Debug routines */
+/*
+ * Print one data server entry: address, port, reference count, client
+ * pointer and the client's exchange flags (0 when no client is attached).
+ * A NULL @ds is reported rather than dereferenced.
+ */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+        struct nfs_client *clp;
+        if (!ds) {
+                printk("%s NULL device\n", __func__);
+                return;
+        }
+        clp = ds->ds_clp;
+        printk("        ip_addr %x port %hu\n"
+                "        ref count %d\n"
+                "        client %p\n"
+                "        cl_exchange_flags %x\n",
+                ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+                atomic_read(&ds->ds_count), clp,
+                clp ? clp->cl_exchange_flags : 0);
+}
+/*
+ * Print ds_num and every entry of dsaddr->ds_list via print_ds(), but
+ * only inside the ifdebug(FACILITY) guard.
+ */
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+        int i;
+        ifdebug(FACILITY) {
+                printk("%s dsaddr->ds_num %d\n", __func__,
+                       dsaddr->ds_num);
+                for (i = 0; i < dsaddr->ds_num; i++)
+                        print_ds(dsaddr->ds_list[i]);
+        }
+}
+/*
+ * dprintk the device id as four 32-bit words read straight from memory,
+ * each printed in unpadded hex.
+ */
+void print_deviceid(struct nfs4_deviceid *id)
+{
+        u32 *p = (u32 *)id;
+        dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+                p[0], p[1], p[2], p[3]);
+}
+/*
+ * Search the data server cache for an entry with the given address and
+ * port. Returns the entry, or NULL if there is none; no reference is
+ * taken. nfs4_ds_cache_lock is held by the caller.
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+        struct nfs4_pnfs_ds *cur;
+        dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+                        ntohl(ip_addr), ntohs(port));
+        list_for_each_entry(cur, &nfs4_data_server_cache, ds_node) {
+                if (cur->ds_ip_addr != ip_addr)
+                        continue;
+                if (cur->ds_port != port)
+                        continue;
+                return cur;
+        }
+        return NULL;
+}
+/*
+ * Free a data server entry, first dropping its nfs_client reference if it
+ * has one. The entry is not unlinked from the cache list here; the caller
+ * in this file does that before calling.
+ */
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+        dprintk("--> %s\n", __func__);
+        ifdebug(FACILITY)
+                print_ds(ds);
+        if (ds->ds_clp)
+                nfs_put_client(ds->ds_clp);
+        kfree(ds);
+}
+/*
+ * Free a decoded device: drop one reference on each data server it lists
+ * (unlinking and destroying a server whose count reaches zero), then free
+ * the stripe index table and the device itself. NULL ds_list slots are
+ * skipped, so a partially decoded device can be passed in.
+ */
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+        struct nfs4_pnfs_ds *server;
+        int idx;
+        print_deviceid(&dsaddr->deviceid.de_id);
+        for (idx = 0; idx < dsaddr->ds_num; idx++) {
+                server = dsaddr->ds_list[idx];
+                if (server == NULL)
+                        continue;
+                /* returns with the lock held only when the count hit zero */
+                if (!atomic_dec_and_lock(&server->ds_count,
+                                         &nfs4_ds_cache_lock))
+                        continue;
+                list_del_init(&server->ds_node);
+                spin_unlock(&nfs4_ds_cache_lock);
+                destroy_ds(server);
+        }
+        kfree(dsaddr->stripe_indices);
+        kfree(dsaddr);
+}
+/*
+ * Free routine handed to the deviceid cache: recover the enclosing
+ * nfs4_file_layout_dsaddr from its deviceid node and free it.
+ */
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+        nfs4_fl_free_deviceid(container_of(device,
+                                           struct nfs4_file_layout_dsaddr,
+                                           deviceid));
+}
+/*
+ * Return the cached data server for ip_addr/port with its reference count
+ * raised, or add a new entry (count 1, no client) if none exists.
+ * The new entry is allocated before the lock is taken and discarded when
+ * the lookup finds an existing one. Returns NULL if that allocation
+ * fails. @inode is not used.
+ */
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+        struct nfs4_pnfs_ds *found, *new_ds, *result;
+        new_ds = kzalloc(sizeof(*new_ds), GFP_KERNEL);
+        if (new_ds == NULL)
+                return NULL;
+        spin_lock(&nfs4_ds_cache_lock);
+        found = _data_server_lookup_locked(ip_addr, port);
+        if (found != NULL) {
+                kfree(new_ds);
+                atomic_inc(&found->ds_count);
+                dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+                        __func__, found->ds_ip_addr,
+                        atomic_read(&found->ds_count));
+                result = found;
+        } else {
+                new_ds->ds_ip_addr = ip_addr;
+                new_ds->ds_port = port;
+                atomic_set(&new_ds->ds_count, 1);
+                INIT_LIST_HEAD(&new_ds->ds_node);
+                new_ds->ds_clp = NULL;
+                list_add(&new_ds->ds_node, &nfs4_data_server_cache);
+                dprintk("%s add new data server ip 0x%x\n", __func__,
+                        new_ds->ds_ip_addr);
+                result = new_ds;
+        }
+        spin_unlock(&nfs4_ds_cache_lock);
+        return result;
+}
+/*
+ * Decode one netaddr (r_netid, r_addr) at *pp and return the matching
+ * data server cache entry, adding one if needed.
+ *
+ * *pp is always advanced past both strings, even when the address is
+ * rejected. Returns NULL if the netid is not "tcp", the address is not
+ * a well-formed "h1.h2.h3.h4.p1.p2" IPv4 universal address, or memory
+ * cannot be allocated.
+ *
+ * Currently only support ipv4, and one multi-path address.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+        struct nfs4_pnfs_ds *ds = NULL;
+        char *buf;
+        const char *ipend;
+        u32 ip_addr, port;
+        int nlen, rlen, i;
+        int tmp[2];
+        __be32 *r_netid, *r_addr, *p = *pp;
+        /* r_netid */
+        nlen = be32_to_cpup(p++);
+        r_netid = p;
+        p += XDR_QUADLEN(nlen);
+        /* r_addr */
+        rlen = be32_to_cpup(p++);
+        r_addr = p;
+        p += XDR_QUADLEN(rlen);
+        *pp = p;
+        /* Check that netid is "tcp" */
+        if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
+                dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+                goto out_err;
+        }
+        /* ipv6 length plus port is legal; a negative length never is */
+        if (rlen < 0 || rlen > INET6_ADDRSTRLEN + 8) {
+                dprintk("%s Invalid address, length %d\n", __func__,
+                        rlen);
+                goto out_err;
+        }
+        buf = kmalloc(rlen + 1, GFP_KERNEL);
+        if (!buf) {
+                dprintk("%s: no memory for address buffer\n", __func__);
+                goto out_err;
+        }
+        buf[rlen] = '\0';
+        memcpy(buf, r_addr, rlen);
+        /* replace the port dots with dashes for the in4_pton() delimiter*/
+        for (i = 0; i < 2; i++) {
+                char *res = strrchr(buf, '.');
+                /* fewer than two dots: no port suffix to split off */
+                if (!res) {
+                        dprintk("%s: Invalid address %s\n", __func__, buf);
+                        goto out_free;
+                }
+                *res = '-';
+        }
+        /* Currently only support ipv4 address */
+        if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+                dprintk("%s: Only ipv4 addresses supported\n", __func__);
+                goto out_free;
+        }
+        /* port: two decimal octets, high byte first */
+        if (sscanf(ipend, "-%d-%d", &tmp[0], &tmp[1]) != 2 ||
+            tmp[0] < 0 || tmp[0] > 255 || tmp[1] < 0 || tmp[1] > 255) {
+                dprintk("%s: Invalid port in address %s\n", __func__, buf);
+                goto out_free;
+        }
+        port = htons((tmp[0] << 8) | (tmp[1]));
+        ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+        dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+        kfree(buf);
+out_err:
+        return ds;
+}
+/*
+ * Decode opaque device data and return the result.
+ *
+ * The data at pdev->area is read as: stripe count, that many stripe
+ * indices, the multipath list count, and then one multipath list per
+ * data server. Only the first address of each multipath list is used;
+ * the rest are skipped.
+ *
+ * Returns a newly allocated nfs4_file_layout_dsaddr, or NULL if a count
+ * is out of range, a stripe index does not name a data server, an address
+ * cannot be decoded, or memory runs out.
+ *
+ * NOTE(review): reads are not bounded by the size of pdev->area here;
+ * presumably the reply length was validated earlier - confirm.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+        int i, dummy;
+        u32 cnt, num;
+        u8 *indexp;
+        __be32 *p = (__be32 *)pdev->area, *indicesp;
+        struct nfs4_file_layout_dsaddr *dsaddr;
+        /* Get the stripe count (number of stripe index) */
+        cnt = be32_to_cpup(p++);
+        dprintk("%s stripe count  %d\n", __func__, cnt);
+        if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+                printk(KERN_WARNING "%s: stripe count %d greater than "
+                       "supported maximum %d\n", __func__,
+                        cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+                goto out_err;
+        }
+        /* Check the multipath list count */
+        indicesp = p;
+        p += XDR_QUADLEN(cnt << 2);
+        num = be32_to_cpup(p++);
+        dprintk("%s ds_num %u\n", __func__, num);
+        if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+                printk(KERN_WARNING "%s: multipath count %d greater than "
+                        "supported maximum %d\n", __func__,
+                        num, NFS4_PNFS_MAX_MULTI_CNT);
+                goto out_err;
+        }
+        /*
+         * The allocation below is sized with (num - 1); with num == 0 that
+         * wraps around and yields a bogus size, so reject it up front.
+         */
+        if (num == 0) {
+                printk(KERN_WARNING "%s: multipath count is zero\n",
+                        __func__);
+                goto out_err;
+        }
+        dsaddr = kzalloc(sizeof(*dsaddr) +
+                        (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+                        GFP_KERNEL);
+        if (!dsaddr)
+                goto out_err;
+        dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+        if (!dsaddr->stripe_indices)
+                goto out_err_free;
+        dsaddr->stripe_count = cnt;
+        dsaddr->ds_num = num;
+        memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+        /* Go back and read stripe indices */
+        p = indicesp;
+        indexp = &dsaddr->stripe_indices[0];
+        for (i = 0; i < dsaddr->stripe_count; i++) {
+                /*
+                 * Validate the 32-bit wire value before narrowing it to
+                 * u8, otherwise e.g. 256 would be accepted as index 0.
+                 */
+                u32 idx = be32_to_cpup(p++);
+                if (idx >= num)
+                        goto out_err_free;
+                *indexp++ = idx;
+        }
+        /* Skip already read multipath list count */
+        p++;
+        for (i = 0; i < dsaddr->ds_num; i++) {
+                int j;
+                dummy = be32_to_cpup(p++); /* multipath count */
+                if (dummy > 1) {
+                        printk(KERN_WARNING
+                               "%s: Multipath count %d not supported, "
+                               "skipping all greater than 1\n", __func__,
+                                dummy);
+                }
+                for (j = 0; j < dummy; j++) {
+                        if (j == 0) {
+                                dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
+                                if (dsaddr->ds_list[i] == NULL)
+                                        goto out_err_free;
+                        } else {
+                                u32 len;
+                                /* skip extra multipath */
+                                len = be32_to_cpup(p++);
+                                p += XDR_QUADLEN(len);
+                                len = be32_to_cpup(p++);
+                                p += XDR_QUADLEN(len);
+                                continue;
+                        }
+                }
+        }
+        return dsaddr;
+out_err_free:
+        /* copes with a partly filled ds_list: NULL slots are skipped */
+        nfs4_fl_free_deviceid(dsaddr);
+out_err:
+        dprintk("%s ERROR: returning NULL\n", __func__);
+        return NULL;
+}
+/*
+ * Decode the opaque device specified in 'dev' and hand it to
+ * pnfs_add_deviceid() for the client's deviceid cache.
+ * Returns the nfs4_file_layout_dsaddr enclosing whatever node
+ * pnfs_add_deviceid() returns (presumably the already cached one when the
+ * deviceid was present, with the new one thrown away - confirm), or NULL
+ * if decoding fails.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
+        struct pnfs_deviceid_node *node;
+        struct nfs4_file_layout_dsaddr *decoded = decode_device(inode, dev);
+        if (decoded == NULL) {
+                printk(KERN_WARNING "%s: Could not decode or add device\n",
+                        __func__);
+                return NULL;
+        }
+        node = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+                                 &decoded->deviceid);
+        return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
+/*
+ * Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it.
+ *
+ * A reply buffer of max_resp_sz >> PAGE_SHIFT pages is allocated and
+ * vmap()ed for the GETDEVICEINFO call, then decoded with
+ * decode_and_add_device(). The buffer, page array and pnfs_device are
+ * freed on every path before returning.
+ *
+ * Returns the device, or NULL if an allocation, the mapping, the
+ * GETDEVICEINFO call or the decode fails.
+ */
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
+        struct pnfs_device *pdev = NULL;
+        u32 max_resp_sz;
+        int max_pages;
+        struct page **pages = NULL;
+        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+        int rc, i;
+        struct nfs_server *server = NFS_SERVER(inode);
+        /*
+         * Use the session max response size as the basis for setting
+         * GETDEVICEINFO's maxcount
+         */
+        max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+        max_pages = max_resp_sz >> PAGE_SHIFT;
+        dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+                __func__, inode, max_resp_sz, max_pages);
+        pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
+        if (pdev == NULL)
+                return NULL;
+        pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+        if (pages == NULL) {
+                kfree(pdev);
+                return NULL;
+        }
+        for (i = 0; i < max_pages; i++) {
+                pages[i] = alloc_page(GFP_KERNEL);
+                if (!pages[i])
+                        goto out_free;
+        }
+        /* set pdev->area */
+        pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+        if (!pdev->area)
+                goto out_free;
+        memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+        pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+        pdev->pages = pages;
+        pdev->pgbase = 0;
+        pdev->pglen = PAGE_SIZE * max_pages;
+        pdev->mincount = 0;
+        rc = nfs4_proc_getdeviceinfo(server, pdev);
+        dprintk("%s getdevice info returns %d\n", __func__, rc);
+        if (rc)
+                goto out_free;
+        /*
+         * Found new device, need to decode it and then add it to the
+         * list of known devices for this mountpoint.
+         */
+        dsaddr = decode_and_add_device(inode, pdev);
+out_free:
+        if (pdev->area != NULL)
+                vunmap(pdev->area);
+        /*
+         * pages[] starts out zeroed and is filled in order, so when
+         * alloc_page() failed part-way the first NULL slot marks the end
+         * of what must be freed; __free_page() must not be given NULL.
+         */
+        for (i = 0; i < max_pages && pages[i]; i++)
+                __free_page(pages[i]);
+        kfree(pages);
+        kfree(pdev);
+        dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+        return dsaddr;
+}
+/*
+ * Look @id up in the client's deviceid cache via pnfs_find_get_deviceid().
+ * Returns the enclosing nfs4_file_layout_dsaddr, or NULL if the lookup
+ * returns nothing.
+ */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+{
+        struct pnfs_deviceid_node *node;
+        node = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
+        if (node == NULL)
+                return NULL;
+        return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..0f24cdf2cb13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "callback.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD0_MAXREAD
                        | FATTR4_WORD0_MAXWRITE
                        | FATTR4_WORD0_LEASE_TIME,
-                        0
+                        FATTR4_WORD1_TIME_DELTA
+                        | FATTR4_WORD1_FS_LAYOUT_TYPES
 };
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
                        nfs4_state_mark_reclaim_nograce(clp, state);
                        goto do_state_recovery;
                case -NFS4ERR_STALE_STATEID:
-                        if (state == NULL)
-                                break;
-                        nfs4_state_mark_reclaim_reboot(clp, state);
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
                        goto do_state_recovery;
@@ -334,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 * Must be called while holding tbl->slot_tbl_lock
 */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 {
+        int free_slotid = free_slot - tbl->slots;
        int slotid = free_slotid;
+        BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
        /* clear used bit in bitmap */
        __clear_bit(slotid, tbl->used_slots);
@@ -379,7 +380,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        struct nfs4_slot_table *tbl;
        tbl = &res->sr_session->fc_slot_table;
-        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
+        if (!res->sr_slot) {
                /* just wake up the next guy waiting since
                 * we may have not consumed a slot after all */
                dprintk("%s: No slot\n", __func__);
@@ -387,17 +388,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        }
        spin_lock(&tbl->slot_tbl_lock);
-        nfs4_free_slot(tbl, res->sr_slotid);
+        nfs4_free_slot(tbl, res->sr_slot);
        nfs41_check_drain_session_complete(res->sr_session);
        spin_unlock(&tbl->slot_tbl_lock);
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        res->sr_slot = NULL;
 }
 static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
        unsigned long timestamp;
-        struct nfs4_slot_table *tbl;
-        struct nfs4_slot *slot;
        struct nfs_client *clp;
        /*
@@ -410,17 +409,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
                res->sr_status = NFS_OK;
        /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
-        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
+        if (!res->sr_slot)
                goto out;
-        tbl = &res->sr_session->fc_slot_table;
-        slot = tbl->slots + res->sr_slotid;
        /* Check the SEQUENCE operation status */
        switch (res->sr_status) {
        case 0:
                /* Update the slot's sequence and clientid lease timer */
-                ++slot->seq_nr;
+                ++res->sr_slot->seq_nr;
                timestamp = res->sr_renewal_time;
                clp = res->sr_session->clp;
                do_renew_lease(clp, timestamp);
@@ -433,12 +429,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
                 * returned NFS4ERR_DELAY as per Section 2.10.6.2
                 * of RFC5661.
                 */
-                dprintk("%s: slot=%d seq=%d: Operation in progress\n",
+                dprintk("%s: slot=%td seq=%d: Operation in progress\n",
-                                __func__, res->sr_slotid, slot->seq_nr);
+                        __func__,
+                        res->sr_slot - res->sr_session->fc_slot_table.slots,
+                        res->sr_slot->seq_nr);
                goto out_retry;
        default:
                /* Just update the slot sequence no. */
-                ++slot->seq_nr;
+                ++res->sr_slot->seq_nr;
        }
 out:
        /* The session may be reset by one of the error handlers. */
@@ -505,10 +503,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        dprintk("--> %s\n", __func__);
        /* slot already allocated? */
-        if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
+        if (res->sr_slot != NULL)
                return 0;
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
        tbl = &session->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +547,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
        res->sr_session = session;
-        res->sr_slotid = slotid;
+        res->sr_slot = slot;
        res->sr_renewal_time = jiffies;
        res->sr_status_flags = 0;
        /*
@@ -576,8 +573,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
                goto out;
        }
-        dprintk("--> %s clp %p session %p sr_slotid %d\n",
+        dprintk("--> %s clp %p session %p sr_slot %td\n",
-                __func__, session->clp, session, res->sr_slotid);
+                __func__, session->clp, session, res->sr_slot ?
+                        res->sr_slot - session->fc_slot_table.slots : -1);
        ret = nfs41_setup_sequence(session, args, res, cache_reply,
                                   task);
@@ -650,7 +648,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
                .callback_data = &data
        };
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        res->sr_slot = NULL;
        if (privileged)
                task_setup.callback_ops = &nfs41_call_priv_sync_ops;
        task = rpc_run_task(&task_setup);
@@ -735,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
        p->o_res.server = p->o_arg.server;
        nfs_fattr_init(&p->f_attr);
        nfs_fattr_init(&p->dir_attr);
-        p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
        clear_bit(NFS_DELEGATED_STATE, &state->flags);
        smp_rmb();
        if (state->n_rdwr != 0) {
+                clear_bit(NFS_O_RDWR_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
                if (ret != 0)
                        return ret;
@@ -1127,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
                        return -ESTALE;
        }
        if (state->n_wronly != 0) {
+                clear_bit(NFS_O_WRONLY_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
                if (ret != 0)
                        return ret;
@@ -1134,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
                        return -ESTALE;
        }
        if (state->n_rdonly != 0) {
+                clear_bit(NFS_O_RDONLY_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
                if (ret != 0)
                        return ret;
@@ -1188,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
        int err;
        do {
                err = _nfs4_do_open_reclaim(ctx, state);
-                if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
@@ -1258,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_BAD_STATEID:
                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
                        case -ENOMEM:
                                err = 0;
                                goto out;
@@ -1605,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
                        goto out;
                case -NFS4ERR_GRACE:
                case -NFS4ERR_DELAY:
-                case -EKEYEXPIRED:
                        nfs4_handle_exception(server, err, &exception);
                        err = 0;
                }
@@ -1975,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
-        calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        path_get(path);
        calldata->path = *path;
@@ -1998,120 +2003,17 @@ out:
        return status;
 }
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
-        struct file *filp;
-        int ret;
-        /* If the open_intent is for execute, we have an extra check to make */
-        if (fmode & FMODE_EXEC) {
-                ret = nfs_may_open(state->inode,
-                                state->owner->so_cred,
-                                nd->intent.open.flags);
-                if (ret < 0)
-                        goto out_close;
-        }
-        filp = lookup_instantiate_filp(nd, path->dentry, NULL);
-        if (!IS_ERR(filp)) {
-                struct nfs_open_context *ctx;
-                ctx = nfs_file_open_context(filp);
-                ctx->state = state;
-                return 0;
-        }
-        ret = PTR_ERR(filp);
-out_close:
-        nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
-        return ret;
-}
-struct dentry *
-nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct path path = {
-                .mnt = nd->path.mnt,
-                .dentry = dentry,
-        };
-        struct dentry *parent;
-        struct iattr attr;
-        struct rpc_cred *cred;
        struct nfs4_state *state;
-        struct dentry *res;
-        int open_flags = nd->intent.open.flags;
-        fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
-        if (nd->flags & LOOKUP_CREATE) {
-                attr.ia_mode = nd->intent.open.create_mode;
-                attr.ia_valid = ATTR_MODE;
-                if (!IS_POSIXACL(dir))
-                        attr.ia_mode &= ~current_umask();
-        } else {
-                open_flags &= ~O_EXCL;
-                attr.ia_valid = 0;
-                BUG_ON(open_flags & O_CREAT);
-        }
-        cred = rpc_lookup_cred();
-        if (IS_ERR(cred))
-                return (struct dentry *)cred;
-        parent = dentry->d_parent;
        /* Protect against concurrent sillydeletes */
-        nfs_block_sillyrename(parent);
+        state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
-        state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
+        if (IS_ERR(state))
-        put_rpccred(cred);
+                return ERR_CAST(state);
-        if (IS_ERR(state)) {
+        ctx->state = state;
-                if (PTR_ERR(state) == -ENOENT) {
+        return igrab(state->inode);
-                        d_add(dentry, NULL);
-                        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                }
-                nfs_unblock_sillyrename(parent);
-                return (struct dentry *)state;
-        }
-        res = d_add_unique(dentry, igrab(state->inode));
-        if (res != NULL)
-                path.dentry = res;
-        nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
-        nfs_unblock_sillyrename(parent);
-        nfs4_intent_set_file(nd, &path, state, fmode);
-        return res;
-}
-int
-nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
-{
-        struct path path = {
-                .mnt = nd->path.mnt,
-                .dentry = dentry,
-        };
-        struct rpc_cred *cred;
-        struct nfs4_state *state;
-        fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
-        cred = rpc_lookup_cred();
-        if (IS_ERR(cred))
-                return PTR_ERR(cred);
-        state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
-        put_rpccred(cred);
-        if (IS_ERR(state)) {
-                switch (PTR_ERR(state)) {
-                        case -EPERM:
-                        case -EACCES:
-                        case -EDQUOT:
-                        case -ENOSPC:
-                        case -EROFS:
-                                return PTR_ERR(state);
-                        default:
-                                goto out_drop;
-                }
-        }
-        if (state->inode == dentry->d_inode) {
-                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                nfs4_intent_set_file(nd, &path, state, fmode);
-                return 1;
-        }
-        nfs4_close_sync(&path, state, fmode);
-out_drop:
-        d_drop(dentry);
-        return 0;
 }
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
 static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
-        struct path path = {
+        struct path my_path = {
-                .mnt = nd->path.mnt,
                .dentry = dentry,
        };
+        struct path *path = &my_path;
        struct nfs4_state *state;
-        struct rpc_cred *cred;
+        struct rpc_cred *cred = NULL;
-        fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
+        fmode_t fmode = 0;
        int status = 0;
-        cred = rpc_lookup_cred();
+        if (ctx != NULL) {
-        if (IS_ERR(cred)) {
+                cred = ctx->cred;
-                status = PTR_ERR(cred);
+                path = &ctx->path;
-                goto out;
+                fmode = ctx->mode;
        }
-        state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
+        state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
        d_drop(dentry);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
-                goto out_putcred;
+                goto out;
        }
        d_add(dentry, igrab(state->inode));
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-        if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
+        if (ctx != NULL)
-                status = nfs4_intent_set_file(nd, &path, state, fmode);
+                ctx->state = state;
        else
-                nfs4_close_sync(&path, state, fmode);
+                nfs4_close_sync(path, state, fmode);
-out_putcred:
-        put_rpccred(cred);
 out:
        return status;
 }
@@ -2655,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
        args->bitmask = server->cache_consistency_bitmask;
        res->server = server;
+        res->seq_res.sr_slot = NULL;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2671,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+        struct nfs_server *server = NFS_SERVER(dir);
+        struct nfs_renameargs *arg = msg->rpc_argp;
+        struct nfs_renameres *res = msg->rpc_resp;
+        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+        arg->bitmask = server->attr_bitmask;
+        res->server = server;
+}
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                                 struct inode *new_dir)
+{
+        struct nfs_renameres *res = task->tk_msg.rpc_resp;
+        if (!nfs4_sequence_done(task, &res->seq_res))
+                return 0;
+        if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+                return 0;
+        update_changeattr(old_dir, &res->old_cinfo);
+        nfs_post_op_update_inode(old_dir, res->old_fattr);
+        update_changeattr(new_dir, &res->new_cinfo);
+        nfs_post_op_update_inode(new_dir, res->new_fattr);
+        return 1;
+}
 static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
                struct inode *new_dir, struct qstr *new_name)
 {
        struct nfs_server *server = NFS_SERVER(old_dir);
-        struct nfs4_rename_arg arg = {
+        struct nfs_renameargs arg = {
                .old_dir = NFS_FH(old_dir),
                .new_dir = NFS_FH(new_dir),
                .old_name = old_name,
                .new_name = new_name,
                .bitmask = server->attr_bitmask,
        };
-        struct nfs4_rename_res res = {
+        struct nfs_renameres res = {
                .server = server,
        };
        struct rpc_message msg = {
@@ -2896,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 }
 static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        struct nfs4_readdir_arg args = {
                .fh = NFS_FH(dir),
-                .pages = &page,
+                .pages = pages,
                .pgbase = 0,
                .count = count,
                .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+                .plus = plus,
        };
        struct nfs4_readdir_res res;
        struct rpc_message msg = {
@@ -2932,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct nfs4_exception exception = { };
        int err;
        do {
                err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
                                _nfs4_proc_readdir(dentry, cred, cookie,
-                                        page, count, plus),
+                                        pages, count, plus),
                                &exception);
        } while (exception.retry);
        return err;
@@ -3490,9 +3420,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                        nfs4_state_mark_reclaim_nograce(clp, state);
                        goto do_state_recovery;
                case -NFS4ERR_STALE_STATEID:
-                        if (state == NULL)
-                                break;
-                        nfs4_state_mark_reclaim_reboot(clp, state);
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
                        goto do_state_recovery;
@@ -3626,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
                        case -NFS4ERR_RESOURCE:
                                /* The IBM lawyers misread another document! */
                        case -NFS4ERR_DELAY:
-                        case -EKEYEXPIRED:
                                err = nfs4_delay(clp->cl_rpcclient, &timeout);
                }
        } while (err == 0);
@@ -3721,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        memcpy(&data->stateid, stateid, sizeof(data->stateid));
        data->res.fattr = &data->fattr;
        data->res.server = server;
-        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
@@ -3874,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
        p->arg.fl = &p->fl;
        p->arg.seqid = seqid;
        p->res.seqid = seqid;
-        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->arg.stateid = &lsp->ls_stateid;
        p->lsp = lsp;
        atomic_inc(&lsp->ls_count);
@@ -4054,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
        p->res.lock_seqid = p->arg.lock_seqid;
-        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->lsp = lsp;
        p->server = server;
        atomic_inc(&lsp->ls_count);
@@ -4241,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-                if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
@@ -4266,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
                        goto out;
                case -NFS4ERR_GRACE:
                case -NFS4ERR_DELAY:
-                case -EKEYEXPIRED:
                        nfs4_handle_exception(server, err, &exception);
                        err = 0;
                }
@@ -4412,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
                                err = 0;
                                goto out;
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
+                                err = 0;
+                                goto out;
                        case -ENOMEM:
                        case -NFS4ERR_DENIED:
                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
                                err = 0;
                                goto out;
                        case -NFS4ERR_DELAY:
-                        case -EKEYEXPIRED:
                                break;
                }
                err = nfs4_handle_exception(server, err, &exception);
@@ -4647,7 +4577,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case -NFS4ERR_DELAY:
        case -NFS4ERR_GRACE:
-        case -EKEYEXPIRED:
                dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
                rpc_delay(task, NFS4_POLL_RETRY_MIN);
                task->tk_status = 0;
@@ -4687,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
        };
        int status;
-        res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        dprintk("--> %s\n", __func__);
        task = rpc_run_task(&task_setup);
@@ -4914,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->bc_attrs.max_reqs);
 }
-static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
 {
-        if (rcvd <= sent)
+        struct nfs4_channel_attrs *sent = &args->fc_attrs;
-                return 0;
+        struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
-        printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
-                "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
+        if (rcvd->headerpadsz > sent->headerpadsz)
-        return -EINVAL;
+                return -EINVAL;
+        if (rcvd->max_resp_sz > sent->max_resp_sz)
+                return -EINVAL;
+        /*
+         * Our requested max_ops is the minimum we need; we're not
+         * prepared to break up compounds into smaller pieces than that.
+         * So, no point even trying to continue if the server won't
+         * cooperate:
+         */
+        if (rcvd->max_ops < sent->max_ops)
+                return -EINVAL;
+        if (rcvd->max_reqs == 0)
+                return -EINVAL;
+        return 0;
 }
-#define _verify_fore_channel_attr(_name_) \
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
-        _verify_channel_attr("fore", #_name_, \
+{
-                             args->fc_attrs._name_, \
+        struct nfs4_channel_attrs *sent = &args->bc_attrs;
-                             session->fc_attrs._name_)
+        struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
-#define _verify_back_channel_attr(_name_) \
+        if (rcvd->max_rqst_sz > sent->max_rqst_sz)
-        _verify_channel_attr("back", #_name_, \
+                return -EINVAL;
-                             args->bc_attrs._name_, \
+        if (rcvd->max_resp_sz < sent->max_resp_sz)
-                             session->bc_attrs._name_)
+                return -EINVAL;
+        if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+                return -EINVAL;
+        /* These would render the backchannel useless: */
+        if (rcvd->max_ops  == 0)
+                return -EINVAL;
+        if (rcvd->max_reqs == 0)
+                return -EINVAL;
+        return 0;
+}
-/*
- * The server is not allowed to increase the fore channel header pad size,
- * maximum response size, or maximum number of operations.
- *
- * The back channel attributes are only negotiatied down: We send what the
- * (back channel) server insists upon.
- */
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
                                     struct nfs4_session *session)
 {
-        int ret = 0;
+        int ret;
-        ret |= _verify_fore_channel_attr(headerpadsz);
-        ret |= _verify_fore_channel_attr(max_resp_sz);
-        ret |= _verify_fore_channel_attr(max_ops);
-        ret |= _verify_back_channel_attr(headerpadsz);
-        ret |= _verify_back_channel_attr(max_rqst_sz);
-        ret |= _verify_back_channel_attr(max_resp_sz);
-        ret |= _verify_back_channel_attr(max_resp_sz_cached);
-        ret |= _verify_back_channel_attr(max_ops);
-        ret |= _verify_back_channel_attr(max_reqs);
-        return ret;
+        ret = nfs4_verify_fore_channel_attrs(args, session);
+        if (ret)
+                return ret;
+        return nfs4_verify_back_channel_attrs(args, session);
 }
 static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5046,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
 {
        switch(task->tk_status) {
        case -NFS4ERR_DELAY:
-        case -EKEYEXPIRED:
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
@@ -5180,12 +5114,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
        if (!atomic_inc_not_zero(&clp->cl_count))
                return ERR_PTR(-EIO);
-        calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+        calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
        if (calldata == NULL) {
                nfs_put_client(clp);
                return ERR_PTR(-ENOMEM);
        }
-        calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        msg.rpc_argp = &calldata->args;
        msg.rpc_resp = &calldata->res;
        calldata->clp = clp;
@@ -5254,7 +5187,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
        case -NFS4ERR_WRONG_CRED: /* What to do here? */
                break;
        case -NFS4ERR_DELAY:
-        case -EKEYEXPIRED:
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
@@ -5317,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
                goto out;
        calldata->clp = clp;
        calldata->arg.one_fs = 0;
-        calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        msg.rpc_argp = &calldata->arg;
        msg.rpc_resp = &calldata->res;
@@ -5333,6 +5264,147 @@ out:
        dprintk("<-- %s status=%d\n", __func__, status);
        return status;
 }
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        struct inode *ino = lgp->args.inode;
+        struct nfs_server *server = NFS_SERVER(ino);
+        dprintk("--> %s\n", __func__);
+        if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+                                &lgp->res.seq_res, 0, task))
+                return;
+        rpc_call_start(task);
+}
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        dprintk("--> %s\n", __func__);
+        if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+                return;
+        switch (task->tk_status) {
+        case 0:
+                break;
+        case -NFS4ERR_LAYOUTTRYLATER:
+        case -NFS4ERR_RECALLCONFLICT:
+                task->tk_status = -NFS4ERR_DELAY;
+                /* Fall through */
+        default:
+                if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+                        rpc_restart_call_prepare(task);
+                        return;
+                }
+        }
+        lgp->status = task->tk_status;
+        dprintk("<-- %s\n", __func__);
+}
+static void nfs4_layoutget_release(void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        dprintk("--> %s\n", __func__);
+        put_layout_hdr(lgp->args.inode);
+        if (lgp->res.layout.buf != NULL)
+                free_page((unsigned long) lgp->res.layout.buf);
+        put_nfs_open_context(lgp->args.ctx);
+        kfree(calldata);
+        dprintk("<-- %s\n", __func__);
+}
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+        .rpc_call_prepare = nfs4_layoutget_prepare,
+        .rpc_call_done = nfs4_layoutget_done,
+        .rpc_release = nfs4_layoutget_release,
+};
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+                .rpc_argp = &lgp->args,
+                .rpc_resp = &lgp->res,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_layoutget_call_ops,
+                .callback_data = lgp,
+                .flags = RPC_TASK_ASYNC,
+        };
+        int status = 0;
+        dprintk("--> %s\n", __func__);
+        lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+        if (lgp->res.layout.buf == NULL) {
+                nfs4_layoutget_release(lgp);
+                return -ENOMEM;
+        }
+        lgp->res.seq_res.sr_slot = NULL;
+        task = rpc_run_task(&task_setup_data);
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        status = nfs4_wait_for_completion_rpc_task(task);
+        if (status != 0)
+                goto out;
+        status = lgp->status;
+        if (status != 0)
+                goto out;
+        status = pnfs_layout_process(lgp);
+out:
+        rpc_put_task(task);
+        dprintk("<-- %s status=%d\n", __func__, status);
+        return status;
+}
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+        struct nfs4_getdeviceinfo_args args = {
+                .pdev = pdev,
+        };
+        struct nfs4_getdeviceinfo_res res = {
+                .pdev = pdev,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+        };
+        int status;
+        dprintk("--> %s\n", __func__);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
+        dprintk("<-- %s status=%d\n", __func__, status);
+        return status;
+}
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+        struct nfs4_exception exception = { };
+        int err;
+        do {
+                err = nfs4_handle_exception(server,
+                                        _nfs4_proc_getdeviceinfo(server, pdev),
+                                        &exception);
+        } while (exception.retry);
+        return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
 #endif /* CONFIG_NFS_V4_1 */
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5443,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .unlink_setup   = nfs4_proc_unlink_setup,
        .unlink_done    = nfs4_proc_unlink_done,
        .rename         = nfs4_proc_rename,
+        .rename_setup   = nfs4_proc_rename_setup,
+        .rename_done    = nfs4_proc_rename_done,
        .link           = nfs4_proc_link,
        .symlink        = nfs4_proc_symlink,
        .mkdir          = nfs4_proc_mkdir,
@@ -5463,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .lock           = nfs4_proc_lock,
        .clear_acl_cache = nfs4_zap_acl_attr,
        .close_context  = nfs4_close_context,
+        .open_context   = nfs4_atomic_open,
 };
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 96524c5dca6b..f575a3126737 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -46,6 +46,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
@@ -53,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 #define OPENOWNER_POOL_SIZE     8
@@ -1063,6 +1065,14 @@ restart:
                                /* Mark the file as being 'closed' */
                                state->state = 0;
                                break;
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
+                                break;
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_BAD_STATEID:
@@ -1138,16 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
                (void)ops->reclaim_complete(clp);
 }
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-                return;
+                return 0;
-        nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1161,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
        }
        nfs_delegation_reap_unclaimed(clp);
+        return 1;
+}
+/*
+ * Finish reboot reclaim: only when nfs4_state_clear_reclaim_reboot()
+ * reports that it actually cleared the reclaim state is
+ * nfs4_reclaim_complete() invoked with the reboot recovery ops.
+ */
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+        if (nfs4_state_clear_reclaim_reboot(clp))
+                nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
        nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
+/*
+ * Emit a rate-limited warning that the state manager hit an expired
+ * RPCSEC_GSS session; @s is printed as the server name.
+ */
+static void nfs4_warn_keyexpired(const char *s)
+{
+        printk_ratelimited(KERN_WARNING
+                "Error: state manager encountered RPCSEC_GSS session expired against NFSv4 server %s.\n",
+                s);
+}
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
        switch (error) {
@@ -1187,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_LEASE_MOVED:
                        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-                        nfs4_state_end_reclaim_reboot(clp);
+                        nfs4_state_clear_reclaim_reboot(clp);
                        nfs4_state_start_reclaim_reboot(clp);
                        break;
                case -NFS4ERR_EXPIRED:
@@ -1204,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
                        set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
                        /* Zero session reset errors */
                        return 0;
+                case -EKEYEXPIRED:
+                        /* Nothing we can do */
+                        nfs4_warn_keyexpired(clp->cl_hostname);
+                        return 0;
        }
        return error;
 }
@@ -1414,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
                case -NFS4ERR_DELAY:
                case -NFS4ERR_CLID_INUSE:
                case -EAGAIN:
-                case -EKEYEXPIRED:
                        break;
+                case -EKEYEXPIRED:
+                        nfs4_warn_keyexpired(clp->cl_hostname);
                case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
                                         * in nfs4_exchange_id */
                default:
@@ -1447,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        }
                        clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
                        set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+                        pnfs_destroy_all_layouts(clp);
                }
                if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..f313c4cce7e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_XDR
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz   (op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz   (op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+                                XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+                                1 /* layout type */ + \
+                                1 /* opaque devaddr4 length */ + \
+                                  /* devaddr4 payload is read into page */ \
+                                1 /* notification bitmap length */ + \
+                                1 /* notification bitmap */)
+#define encode_layoutget_maxsz  (op_encode_hdr_maxsz + 10 + \
+                                encode_stateid_maxsz)
+#define decode_layoutget_maxsz  (op_decode_hdr_maxsz + 8 + \
+                                decode_stateid_maxsz + \
+                                XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz   0
 #define decode_sequence_maxsz   0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz    (compound_decode_hdr_maxsz + \
                                         decode_sequence_maxsz + \
                                         decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+                                encode_sequence_maxsz +\
+                                encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+                                decode_sequence_maxsz + \
+                                decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz   (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
+                                encode_putfh_maxsz +        \
+                                encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz   (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
+                                decode_putfh_maxsz +        \
+                                decode_layoutget_maxsz)
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
        if (iap->ia_valid & ATTR_MODE)
                len += 4;
        if (iap->ia_valid & ATTR_UID) {
-                owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+                owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
                if (owner_namelen < 0) {
                        dprintk("nfs: couldn't resolve uid %d to string\n",
                                        iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
                len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
        }
        if (iap->ia_valid & ATTR_GID) {
-                owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+                owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
                if (owner_grouplen < 0) {
                        dprintk("nfs: couldn't resolve gid %d to string\n",
                                        iap->ia_gid);
@@ -1385,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-        uint32_t attrs[2] = {
+        uint32_t attrs[2] = {0, 0};
-                FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
+        uint32_t dircount = readdir->count >> 1;
-                FATTR4_WORD1_MOUNTED_ON_FILEID,
-        };
        __be32 *p;
+        if (readdir->plus) {
+                attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+                        FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+                attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+                        FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+                        FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+                        FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+                dircount >>= 1;
+        }
+        attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+        attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+        /* Switch to mounted_on_fileid if the server supports it */
+        if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+                attrs[0] &= ~FATTR4_WORD0_FILEID;
+        else
+                attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
        p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
        *p++ = cpu_to_be32(OP_READDIR);
        p = xdr_encode_hyper(p, readdir->cookie);
        p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-        *p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+        *p++ = cpu_to_be32(dircount);
        *p++ = cpu_to_be32(readdir->count);
        *p++ = cpu_to_be32(2);
-        /* Switch to mounted_on_fileid if the server supports it */
-        if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-                attrs[0] &= ~FATTR4_WORD0_FILEID;
-        else
-                attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
        *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
        *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
        hdr->nops++;
@@ -1726,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Append a GETDEVICEINFO operation to the compound being built in @xdr.
+ *
+ * Words are written in this order: the opcode, the fixed-size device id,
+ * the layout type, the page length (sent as gdia_maxcount) and a
+ * zero-length notification bitmap. On return hdr->nops has been bumped
+ * and hdr->replen grown by decode_getdeviceinfo_maxsz.
+ */
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+                     const struct nfs4_getdeviceinfo_args *args,
+                     struct compound_hdr *hdr)
+{
+        __be32 *p;
+        /* 16 bytes = the four 32-bit words below; the rest is the device id */
+        p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+        *p++ = cpu_to_be32(OP_GETDEVICEINFO);
+        p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+                                    NFS4_DEVICEID4_SIZE);
+        *p++ = cpu_to_be32(args->pdev->layout_type);
+        *p++ = cpu_to_be32(args->pdev->pglen);          /* gdia_maxcount */
+        *p++ = cpu_to_be32(0);                          /* bitmap length 0 */
+        hdr->nops++;
+        hdr->replen += decode_getdeviceinfo_maxsz;
+}
+/*
+ * Append a LAYOUTGET operation to the compound being built in @xdr.
+ *
+ * Words are written in this order: the opcode, a zero "signal layout
+ * available" flag, the layout type, the iomode, then offset, length and
+ * minlength as 64-bit values, the layout stateid obtained from
+ * pnfs_get_layout_stateid(), and finally maxcount. On return hdr->nops
+ * has been bumped and hdr->replen grown by decode_layoutget_maxsz.
+ */
+static void
+encode_layoutget(struct xdr_stream *xdr,
+                      const struct nfs4_layoutget_args *args,
+                      struct compound_hdr *hdr)
+{
+        nfs4_stateid stateid;
+        __be32 *p;
+        /* 44 bytes = five 32-bit words plus three 64-bit values; the rest is the stateid */
+        p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+        *p++ = cpu_to_be32(OP_LAYOUTGET);
+        *p++ = cpu_to_be32(0);     /* Signal layout available */
+        *p++ = cpu_to_be32(args->type);
+        *p++ = cpu_to_be32(args->range.iomode);
+        p = xdr_encode_hyper(p, args->range.offset);
+        p = xdr_encode_hyper(p, args->range.length);
+        p = xdr_encode_hyper(p, args->minlength);
+        pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+                                args->ctx->state);
+        p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+        /* last word of the reservation, so no post-increment */
+        *p = cpu_to_be32(args->maxcount);
+        dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+                __func__,
+                args->type,
+                args->range.iomode,
+                (unsigned long)args->range.offset,
+                (unsigned long)args->range.length,
+                args->maxcount);
+        hdr->nops++;
+        hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * END OF "GENERIC" ENCODE ROUTINES.
 */
@@ -1823,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
 * Encode RENAME request
 */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
@@ -2543,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
        return 0;
 }
+/*
+ * Build the GETDEVICEINFO request: a compound made of SEQUENCE followed
+ * by GETDEVICEINFO, with the reply page area pointed at the caller's
+ * device pages.
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_args *args)
+{
+        struct xdr_stream stream;
+        struct compound_hdr compound = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        xdr_init_encode(&stream, &req->rq_snd_buf, p);
+        encode_compound_hdr(&stream, req, &compound);
+        encode_sequence(&stream, &args->seq_args, &compound);
+        encode_getdeviceinfo(&stream, args, &compound);
+        /*
+         * Set up the reply kvec. The head length is shortened by the
+         * notification bitmap max size (2 words) so that the
+         * notification bitmap is put in the xdr_buf tail.
+         */
+        xdr_inline_pages(&req->rq_rcv_buf, (compound.replen - 2) << 2,
+                         args->pdev->pages, args->pdev->pgbase,
+                         args->pdev->pglen);
+        encode_nops(&compound);
+        return 0;
+}
+/*
+ * Build the LAYOUTGET request: a compound made of SEQUENCE, PUTFH for the
+ * inode named in @args, and LAYOUTGET. Always returns 0.
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+                                  struct nfs4_layoutget_args *args)
+{
+        struct xdr_stream stream;
+        struct compound_hdr compound = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        xdr_init_encode(&stream, &req->rq_snd_buf, p);
+        encode_compound_hdr(&stream, req, &compound);
+        encode_sequence(&stream, &args->seq_args, &compound);
+        encode_putfh(&stream, NFS_FH(args->inode), &compound);
+        encode_layoutget(&stream, args, &compound);
+        encode_nops(&compound);
+        return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2676,7 +2812,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
        if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-                decode_attr_bitmap(xdr, bitmask);
+                int ret;
+                ret = decode_attr_bitmap(xdr, bitmask);
+                if (unlikely(ret < 0))
+                        return ret;
                bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
        } else
                bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2987,56 @@ out_overflow:
        return -EIO;
 }
+/*
+ * Step over the optional RDATTR_ERROR attribute.
+ *
+ * Returns -EIO if any bit below FATTR4_WORD0_RDATTR_ERROR is still set in
+ * bitmap[0] or if the stream is too short; otherwise 0. When the
+ * attribute is present its 4-byte value is consumed from the stream
+ * without being examined, and its bit is cleared from bitmap[0].
+ */
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+        if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+                return -EIO;
+        if (!likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR))
+                return 0;
+        if (unlikely(xdr_inline_decode(xdr, 4) == NULL)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+        return 0;
+}
+/*
+ * Decode the optional FILEHANDLE attribute.
+ *
+ * @fh may be NULL. When non-NULL it is zeroed on entry and, if the
+ * attribute is present, filled with the handle bytes and size from the
+ * stream. The attribute's bit is cleared from bitmap[0] once consumed.
+ *
+ * Returns 0 on success (including when the attribute is absent) and -EIO
+ * when any bit below FATTR4_WORD0_FILEHANDLE is still set in bitmap[0],
+ * when the advertised length exceeds NFS4_FHSIZE, or when the stream is
+ * too short.
+ */
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+        __be32 *p;
+        /*
+         * Unsigned on purpose: the length comes straight off the wire. With
+         * a signed type, a value with the top bit set would compare as
+         * negative, pass the NFS4_FHSIZE bound below, and then be used as
+         * a huge size by xdr_inline_decode() and memcpy().
+         */
+        uint32_t len;
+        if (fh != NULL)
+                memset(fh, 0, sizeof(*fh));
+        if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+                return -EIO;
+        if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                len = be32_to_cpup(p);
+                if (len > NFS4_FHSIZE)
+                        return -EIO;
+                p = xdr_inline_decode(xdr, len);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (fh != NULL) {
+                        memcpy(fh->data, p, len);
+                        fh->size = len;
+                }
+                bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
        __be32 *p;
@@ -3521,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
        return status;
 }
+/*
+ * Decode the optional TIME_DELTA attribute into @time.
+ *
+ * @time is zeroed first, so it holds 0/0 when the attribute is absent.
+ * Returns -EIO if any bit below FATTR4_WORD1_TIME_DELTA is still set in
+ * bitmap[1]; otherwise the result of decode_attr_time(), or 0 when the
+ * attribute is not present.
+ */
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+                                  struct timespec *time)
+{
+        int ret = 0;
+        time->tv_nsec = 0;
+        time->tv_sec = 0;
+        if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+                return -EIO;
+        if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+                ret = decode_attr_time(xdr, time);
+                bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+        }
+        dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+                (long)time->tv_nsec);
+        return ret;
+}
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
        int status = 0;
@@ -3744,29 +3951,14 @@ xdr_error:
        return status;
 }
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+                struct nfs_fattr *fattr, struct nfs_fh *fh,
                const struct nfs_server *server, int may_sleep)
 {
-        __be32 *savep;
-        uint32_t attrlen,
-                 bitmap[2] = {0},
-                 type;
        int status;
        umode_t fmode = 0;
        uint64_t fileid;
+        uint32_t type;
-        status = decode_op_hdr(xdr, OP_GETATTR);
-        if (status < 0)
-                goto xdr_error;
-        status = decode_attr_bitmap(xdr, bitmap);
-        if (status < 0)
-                goto xdr_error;
-        status = decode_attr_length(xdr, &attrlen, &savep);
-        if (status < 0)
-                goto xdr_error;
        status = decode_attr_type(xdr, bitmap, &type);
        if (status < 0)
@@ -3792,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                goto xdr_error;
        fattr->valid |= status;
+        status = decode_attr_error(xdr, bitmap);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_attr_filehandle(xdr, bitmap, fh);
+        if (status < 0)
+                goto xdr_error;
        status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
        if (status < 0)
                goto xdr_error;
@@ -3862,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                fattr->valid |= status;
        }
+xdr_error:
+        dprintk("%s: xdr returned %d\n", __func__, -status);
+        return status;
+}
+/*
+ * Decode a complete GETATTR reply.
+ *
+ * Reads the GETATTR op header, the attribute bitmap and the attribute
+ * length, hands the attribute body to decode_getfattr_attrs() (passing
+ * @fh through unchanged), and finally checks the consumed length with
+ * verify_attr_len(). The first negative status stops decoding and is
+ * returned after being logged.
+ */
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+                struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+        __be32 *savep;
+        uint32_t attrlen,
+                 bitmap[2] = {0};
+        int status;
+        status = decode_op_hdr(xdr, OP_GETATTR);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_attr_bitmap(xdr, bitmap);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_attr_length(xdr, &attrlen, &savep);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+        if (status < 0)
+                goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
        dprintk("%s: xdr returned %d\n", __func__, -status);
        return status;
 }
+/*
+ * GETATTR decode for callers that do not want a filehandle back: forwards
+ * to decode_getfattr_generic() with a NULL fh.
+ */
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+                const struct nfs_server *server, int may_sleep)
+{
+        return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+}
+/*
+ * Decode potentially multiple layout types and report the first one.
+ * Currently we only support one layout driver per file system, so any
+ * further entries are consumed from the stream and ignored after a
+ * warning is printed.
+ *
+ * *layouttype is set to 0 when the server sends an empty list. Returns 0
+ * on success and -EIO when the stream is too short for the advertised
+ * number of entries or the count is too large to express in bytes.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+                                         uint32_t *layouttype)
+{
+        __be32 *p;
+        /* Unsigned: the count is server-supplied and must not go negative */
+        uint32_t num;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        num = be32_to_cpup(p);
+        /* pNFS is not supported by the underlying file system */
+        if (num == 0) {
+                *layouttype = 0;
+                return 0;
+        }
+        /*
+         * Reject counts whose byte length (num * 4) would not fit in 32
+         * bits, rather than letting the multiplication below wrap.
+         */
+        if (unlikely(num > (~(uint32_t)0 >> 2)))
+                goto out_overflow;
+        if (num > 1)
+                printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+                        "per filesystem not supported\n", __func__);
+        /* Decode and set first layout type, move xdr->p past unused types */
+        p = xdr_inline_decode(xdr, num * 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        *layouttype = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * Decode the optional FS_LAYOUT_TYPES attribute (the type of file system
+ * exported).
+ *
+ * *layouttype is written on every non-error path: 0 when the attribute is
+ * absent, otherwise whatever decode_first_pnfs_layout_type() stores.
+ * Returns -EIO if any bit below FATTR4_WORD1_FS_LAYOUT_TYPES is still set
+ * in bitmap[1].
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+                                uint32_t *layouttype)
+{
+        int ret;
+        dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+        if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+                return -EIO;
+        if (!(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) {
+                *layouttype = 0;
+                return 0;
+        }
+        ret = decode_first_pnfs_layout_type(xdr, layouttype);
+        bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+        return ret;
+}
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
@@ -3894,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
                goto xdr_error;
        fsinfo->wtpref = fsinfo->wtmax;
+        status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+        if (status != 0)
+                goto xdr_error;
+        status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+        if (status != 0)
+                goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -3950,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
        __be32 *p;
        uint32_t namelen, type;
-        p = xdr_inline_decode(xdr, 32);
+        p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
        if (unlikely(!p))
                goto out_overflow;
-        p = xdr_decode_hyper(p, &offset);
+        p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
        p = xdr_decode_hyper(p, &length);
-        type = be32_to_cpup(p++);
+        type = be32_to_cpup(p++); /* 4 byte read */
-        if (fl != NULL) {
+        if (fl != NULL) { /* manipulate file lock */
                fl->fl_start = (loff_t)offset;
                fl->fl_end = fl->fl_start + (loff_t)length - 1;
                if (length == ~(uint64_t)0)
@@ -3966,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
                        fl->fl_type = F_RDLCK;
                fl->fl_pid = 0;
        }
-        p = xdr_decode_hyper(p, &clientid);
+        p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
-        namelen = be32_to_cpup(p);
+        namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
-        p = xdr_inline_decode(xdr, namelen);
+        p = xdr_inline_decode(xdr, namelen); /* variable size field */
        if (likely(p))
                return -NFS4ERR_DENIED;
 out_overflow:
@@ -4200,12 +4495,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
        struct xdr_buf  *rcvbuf = &req->rq_rcv_buf;
-        struct page     *page = *rcvbuf->pages;
        struct kvec     *iov = rcvbuf->head;
        size_t          hdrlen;
        u32             recvd, pglen = rcvbuf->page_len;
-        __be32          *end, *entry, *p, *kaddr;
-        unsigned int    nr = 0;
        int             status;
        status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
                pglen = recvd;
        xdr_read_pages(xdr, pglen);
-        BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-        kaddr = p = kmap_atomic(page, KM_USER0);
-        end = p + ((pglen + readdir->pgbase) >> 2);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                u32 len, attrlen, xlen;
-                if (end - p < 3)
-                        goto short_pkt;
-                dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-                p += 2;                 /* cookie */
-                len = ntohl(*p++);      /* filename length */
-                if (len > NFS4_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-                                        len);
-                        goto err_unmap;
-                }
-                xlen = XDR_QUADLEN(len);
-                if (end - p < xlen + 1)
-                        goto short_pkt;
-                dprintk("filename = %*s\n", len, (char *)p);
-                p += xlen;
-                len = ntohl(*p++);      /* bitmap length */
-                if (end - p < len + 1)
-                        goto short_pkt;
-                p += len;
-                attrlen = XDR_QUADLEN(ntohl(*p++));
-                if (end - p < attrlen + 2)
-                        goto short_pkt;
-                p += attrlen;           /* attributes */
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
-out:
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
-short_pkt:
-        /*
-         * When we get a short packet there are 2 possibilities. We can
-         * return an error, or fix up the response to look like a valid
-         * response and return what we have so far. If there are no
-         * entries and the packet was short, then return -EIO. If there
-         * are valid entries in the response, return them and pretend that
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        dprintk("%s: short packet at entry %d\n", __func__, nr);
-        entry[0] = entry[1] = 0;
-        if (nr)
-                goto out;
-err_unmap:
-        kunmap_atomic(kaddr, KM_USER0);
-        return -errno_NFSERR_IO;
 }
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4299,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
        size_t hdrlen;
        u32 len, recvd;
        __be32 *p;
-        char *kaddr;
        int status;
        status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
         * and and null-terminate the text (the VFS expects
         * null-termination).
         */
-        kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
+        xdr_terminate_string(rcvbuf, len);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -4668,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
                           struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-        struct nfs4_slot *slot;
        struct nfs4_sessionid id;
        u32 dummy;
        int status;
@@ -4700,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
                goto out_overflow;
        /* seqid */
-        slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
        dummy = be32_to_cpup(p++);
-        if (dummy != slot->seq_nr) {
+        if (dummy != res->sr_slot->seq_nr) {
                dprintk("%s Invalid sequence number\n", __func__);
                goto out_err;
        }
        /* slot id */
        dummy = be32_to_cpup(p++);
-        if (dummy != res->sr_slotid) {
+        if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
                dprintk("%s Invalid slot id\n", __func__);
                goto out_err;
        }
@@ -4731,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode the GETDEVICEINFO operation result into @pdev.
+ *
+ * If the operation header reports -ETOOSMALL, the next word of the reply
+ * is stored in pdev->mincount before that status is returned.  Otherwise
+ * the layout type must match pdev->layout_type, the opaque device_addr4
+ * length is stored in pdev->mincount and its body is handed to
+ * xdr_read_pages(), and the trailing notification bitmap must be all zero.
+ *
+ * Returns 0 on success, the decode_op_hdr() status if the operation
+ * failed, -EINVAL on a layout type mismatch, and -EIO if
+ * xdr_inline_decode() fails, the bitmap word count is implausibly large,
+ * or a notification bit is set.
+ */
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+                                struct pnfs_device *pdev)
+{
+        __be32 *p;
+        uint32_t len, type;
+        int status;
+        status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+        if (status) {
+                if (status == -ETOOSMALL) {
+                        p = xdr_inline_decode(xdr, 4);
+                        if (unlikely(!p))
+                                goto out_overflow;
+                        pdev->mincount = be32_to_cpup(p);
+                        dprintk("%s: Min count too small. mincnt = %u\n",
+                                __func__, pdev->mincount);
+                }
+                return status;
+        }
+        p = xdr_inline_decode(xdr, 8);
+        if (unlikely(!p))
+                goto out_overflow;
+        type = be32_to_cpup(p++);
+        if (type != pdev->layout_type) {
+                dprintk("%s: layout mismatch req: %u pdev: %u\n",
+                        __func__, pdev->layout_type, type);
+                return -EINVAL;
+        }
+        /*
+         * Get the length of the opaque device_addr4. xdr_read_pages places
+         * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+         * and places the remaining xdr data in xdr_buf->tail
+         */
+        pdev->mincount = be32_to_cpup(p);
+        xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+        /* Parse notification bitmap, verifying that it is zero. */
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        len = be32_to_cpup(p);
+        if (len) {
+                uint32_t i;
+                /*
+                 * len comes straight off the wire.  Reject word counts whose
+                 * byte size would wrap the 32-bit multiplication below;
+                 * otherwise a tiny request could be satisfied and the loop
+                 * would then walk far more words than were decoded.
+                 */
+                if (unlikely(len > ((uint32_t)~0U >> 2)))
+                        goto out_overflow;
+                p = xdr_inline_decode(xdr, 4 * len);
+                if (unlikely(!p))
+                        goto out_overflow;
+                for (i = 0; i < len; i++, p++) {
+                        if (be32_to_cpup(p)) {
+                                dprintk("%s: notifications not supported\n",
+                                        __func__);
+                                return -EIO;
+                        }
+                }
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * Decode the LAYOUTGET operation result into @res.
+ *
+ * Fills in return_on_close, the stateid, and the first layout's range,
+ * type and opaque body (copied into res->layout.buf).  Entries after the
+ * first one in the layout array are not decoded.
+ *
+ * Returns 0 on success, the decode_op_hdr() or decode_opaque_inline()
+ * status on failure, -EINVAL for an empty layout array, -ENOMEM if the
+ * layout body is larger than PAGE_SIZE, and -EIO if xdr_inline_decode()
+ * fails.
+ */
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+                            struct nfs4_layoutget_res *res)
+{
+        u32 nr_layouts;
+        __be32 *cur;
+        int err;
+        err = decode_op_hdr(xdr, OP_LAYOUTGET);
+        if (err != 0)
+                return err;
+        /* return_on_close (4) + stateid + layout array count (4) */
+        cur = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+        if (unlikely(cur == NULL))
+                goto short_reply;
+        res->return_on_close = be32_to_cpup(cur++);
+        cur = xdr_decode_opaque_fixed(cur, res->stateid.data, NFS4_STATEID_SIZE);
+        nr_layouts = be32_to_cpup(cur);
+        if (nr_layouts == 0) {
+                dprintk("%s: server responded with empty layout array\n",
+                        __func__);
+                return -EINVAL;
+        }
+        /* offset (8) + length (8) + iomode (4) + layout type (4) */
+        cur = xdr_inline_decode(xdr, 24);
+        if (unlikely(cur == NULL))
+                goto short_reply;
+        cur = xdr_decode_hyper(cur, &res->range.offset);
+        cur = xdr_decode_hyper(cur, &res->range.length);
+        res->range.iomode = be32_to_cpup(cur++);
+        res->type = be32_to_cpup(cur++);
+        err = decode_opaque_inline(xdr, &res->layout.len, (char **)&cur);
+        if (unlikely(err != 0))
+                return err;
+        dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+                __func__,
+                (unsigned long)res->range.offset,
+                (unsigned long)res->range.length,
+                res->range.iomode,
+                res->type,
+                res->layout.len);
+        /* nfs4_proc_layoutget allocated a single page */
+        if (res->layout.len > PAGE_SIZE)
+                return -ENOMEM;
+        memcpy(res->layout.buf, cur, res->layout.len);
+        if (nr_layouts > 1) {
+                /* We only handle a length one array at the moment.  Any
+                 * further entries are just ignored.  Note that this means
+                 * the client may see a response that is less than the
+                 * minimum it requested.
+                 */
+                dprintk("%s: server responded with %d layouts, dropping tail\n",
+                        __func__, nr_layouts);
+        }
+        return 0;
+short_reply:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * END OF "GENERIC" DECODE ROUTINES.
 */
@@ -4873,7 +5225,7 @@ out:
 /*
 * Decode RENAME response
 */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -5758,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
                status = decode_reclaim_complete(&xdr, (void *)NULL);
        return status;
 }
+/*
+ * Decode a GETDEVICEINFO compound reply: compound header, SEQUENCE,
+ * then GETDEVICEINFO itself.  Decoding stops at the first step that
+ * reports a non-zero status, and that status is returned.
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_res *res)
+{
+        struct compound_hdr hdr;
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status == 0)
+                status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status == 0)
+                status = decode_getdeviceinfo(&xdr, res->pdev);
+        return status;
+}
+/*
+ * Decode a LAYOUTGET compound reply: compound header, SEQUENCE, PUTFH,
+ * then LAYOUTGET itself.  Decoding stops at the first step that reports
+ * a non-zero status, and that status is returned.
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+                                  struct nfs4_layoutget_res *res)
+{
+        struct compound_hdr hdr;
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status == 0)
+                status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (status == 0)
+                status = decode_putfh(&xdr);
+        if (status == 0)
+                status = decode_layoutget(&xdr, rqstp, res);
+        return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
-__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                           struct nfs_server *server, int plus)
 {
        uint32_t bitmap[2] = {0};
        uint32_t len;
+        __be32 *p = xdr_inline_decode(xdr, 4);
-        if (!*p++) {
+        if (unlikely(!p))
-                if (!*p)
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 12);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->prev_cookie = entry->cookie;
        p = xdr_decode_hyper(p, &entry->cookie);
        entry->len = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name = (const char *) p;
-        p += XDR_QUADLEN(entry->len);
        /*
         * In case the server doesn't return an inode number,
@@ -5784,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
         * since glibc seems to choke on it...)
         */
        entry->ino = 1;
+        entry->fattr->valid = 0;
-        len = ntohl(*p++);              /* bitmap length */
+        if (decode_attr_bitmap(xdr, bitmap) < 0)
-        if (len-- > 0) {
+                goto out_overflow;
-                bitmap[0] = ntohl(*p++);
-                if (len-- > 0) {
+        if (decode_attr_length(xdr, &len, &p) < 0)
-                        bitmap[1] = ntohl(*p++);
+                goto out_overflow;
-                        p += len;
-                }
+        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
-        }
+                goto out_overflow;
-        len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */
+        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
-        if (len > 0) {
+                entry->ino = entry->fattr->fileid;
-                if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
-                        bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+        if (verify_attr_len(xdr, p, len) < 0)
-                        /* Ignore the return value of rdattr_error for now */
+                goto out_overflow;
-                        p++;
-                        len--;
+        p = xdr_inline_peek(xdr, 8);
-                }
+        if (p != NULL)
-                if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
+                entry->eof = !p[0] && p[1];
-                        xdr_decode_hyper(p, &entry->ino);
+        else
-                else if (bitmap[0] == FATTR4_WORD0_FILEID)
+                entry->eof = 0;
-                        xdr_decode_hyper(p, &entry->ino);
-                p += len;
-        }
-        entry->eof = !p[0] && p[1];
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
 }
 /*
@@ -5936,6 +6348,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
  PROC(SEQUENCE,        enc_sequence,   dec_sequence),
  PROC(GET_LEASE_TIME,  enc_get_lease_time,     dec_get_lease_time),
  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
 *
 *  Allow an NFS filesystem to be mounted as root. The way this works is:
 *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
- *     (2) Handle RPC negotiation with the system which replied to RARP or
+ *     (2) Construct the device string and the options string using DHCP
- *         was reported as a boot server by BOOTP or manually.
+ *         option 17 and/or kernel command line options.
- *     (3) The actual mounting is done later, when init() is running.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *         to the NFS client's regular mount interface via sys_mount().
 *
 *
 *      Changes:
@@ -65,470 +66,245 @@
 *      Hua Qin         :       Support for mounting root file system via
 *                              NFS over TCP.
 *      Fabian Frederick:       Option parser rebuilt (using parser lib)
-*/
+ *      Chuck Lever     :       Use super.c's text-based mount option parsing
+ *      Chuck Lever     :       Add "nfsrootdebug".
+ */
 #include <linux/types.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_mount.h>
-#include <linux/in.h>
-#include <linux/major.h>
 #include <linux/utsname.h>
-#include <linux/inet.h>
 #include <linux/root_dev.h>
 #include <net/ipconfig.h>
-#include <linux/parser.h>
 #include "internal.h"
-/* Define this to allow debugging output */
-#undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
-/* Default port to use if server is not running a portmapper */
-#define NFS_MNT_PORT    627
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT                "/tftpboot/%s"
 /* Parameters passed from the kernel command line */
-static char nfs_root_name[256] __initdata = "";
+static char nfs_root_parms[256] __initdata = "";
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = "";
 /* Address of NFS server */
-static __be32 servaddr __initdata = 0;
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
-/* NFS-related data */
-static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
-static int nfs_port __initdata = 0;             /* Port to connect to for NFS */
-static int mount_port __initdata = 0;           /* Mount daemon port number */
-/***************************************************************************
-                             Parsing of options
- ***************************************************************************/
-enum {
-        /* Options that take integer arguments */
-        Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-        Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
-        /* Options that take no arguments */
-        Opt_soft, Opt_hard, Opt_intr,
-        Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
-        Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
-        Opt_acl, Opt_noacl,
-        /* Error token */
-        Opt_err
-};
-static const match_table_t tokens __initconst = {
-        {Opt_port, "port=%u"},
-        {Opt_rsize, "rsize=%u"},
-        {Opt_wsize, "wsize=%u"},
-        {Opt_timeo, "timeo=%u"},
-        {Opt_retrans, "retrans=%u"},
-        {Opt_acregmin, "acregmin=%u"},
-        {Opt_acregmax, "acregmax=%u"},
-        {Opt_acdirmin, "acdirmin=%u"},
-        {Opt_acdirmax, "acdirmax=%u"},
-        {Opt_soft, "soft"},
-        {Opt_hard, "hard"},
-        {Opt_intr, "intr"},
-        {Opt_nointr, "nointr"},
-        {Opt_posix, "posix"},
-        {Opt_noposix, "noposix"},
-        {Opt_cto, "cto"},
-        {Opt_nocto, "nocto"},
-        {Opt_ac, "ac"},
-        {Opt_noac, "noac"},
-        {Opt_lock, "lock"},
-        {Opt_nolock, "nolock"},
-        {Opt_v2, "nfsvers=2"},
-        {Opt_v2, "v2"},
-        {Opt_v3, "nfsvers=3"},
-        {Opt_v3, "v3"},
-        {Opt_udp, "proto=udp"},
-        {Opt_udp, "udp"},
-        {Opt_tcp, "proto=tcp"},
-        {Opt_tcp, "tcp"},
-        {Opt_acl, "acl"},
-        {Opt_noacl, "noacl"},
-        {Opt_err, NULL}
-        
-};
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+#ifdef RPC_DEBUG
 /*
- *  Parse option string.
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
+ *
+ * The handler ignores its argument, ORs NFSDBG_ROOT and NFSDBG_MOUNT
+ * into nfs_debug, and always returns 1.  It is only built when
+ * RPC_DEBUG is defined.
 */
+static int __init nfs_root_debug(char *__unused)
-static int __init root_nfs_parse(char *name, char *buf)
 {
+        nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
-        char *p;
-        substring_t args[MAX_OPT_ARGS];
-        int option;
-        if (!name)
-                return 1;
-        /* Set the NFS remote path */
-        p = strsep(&name, ",");
-        if (p[0] != '\0' && strcmp(p, "default") != 0)
-                strlcpy(buf, p, NFS_MAXPATHLEN);
-        while ((p = strsep (&name, ",")) != NULL) {
-                int token; 
-                if (!*p)
-                        continue;
-                token = match_token(p, tokens, args);
-                /* %u tokens only. Beware if you add new tokens! */
-                if (token < Opt_soft && match_int(&args[0], &option))
-                        return 0;
-                switch (token) {
-                        case Opt_port:
-                                nfs_port = option;
-                                break;
-                        case Opt_rsize:
-                                nfs_data.rsize = option;
-                                break;
-                        case Opt_wsize:
-                                nfs_data.wsize = option;
-                                break;
-                        case Opt_timeo:
-                                nfs_data.timeo = option;
-                                break;
-                        case Opt_retrans:
-                                nfs_data.retrans = option;
-                                break;
-                        case Opt_acregmin:
-                                nfs_data.acregmin = option;
-                                break;
-                        case Opt_acregmax:
-                                nfs_data.acregmax = option;
-                                break;
-                        case Opt_acdirmin:
-                                nfs_data.acdirmin = option;
-                                break;
-                        case Opt_acdirmax:
-                                nfs_data.acdirmax = option;
-                                break;
-                        case Opt_soft:
-                                nfs_data.flags |= NFS_MOUNT_SOFT;
-                                break;
-                        case Opt_hard:
-                                nfs_data.flags &= ~NFS_MOUNT_SOFT;
-                                break;
-                        case Opt_intr:
-                        case Opt_nointr:
-                                break;
-                        case Opt_posix:
-                                nfs_data.flags |= NFS_MOUNT_POSIX;
-                                break;
-                        case Opt_noposix:
-                                nfs_data.flags &= ~NFS_MOUNT_POSIX;
-                                break;
-                        case Opt_cto:
-                                nfs_data.flags &= ~NFS_MOUNT_NOCTO;
-                                break;
-                        case Opt_nocto:
-                                nfs_data.flags |= NFS_MOUNT_NOCTO;
-                                break;
-                        case Opt_ac:
-                                nfs_data.flags &= ~NFS_MOUNT_NOAC;
-                                break;
-                        case Opt_noac:
-                                nfs_data.flags |= NFS_MOUNT_NOAC;
-                                break;
-                        case Opt_lock:
-                                nfs_data.flags &= ~NFS_MOUNT_NONLM;
-                                break;
-                        case Opt_nolock:
-                                nfs_data.flags |= NFS_MOUNT_NONLM;
-                                break;
-                        case Opt_v2:
-                                nfs_data.flags &= ~NFS_MOUNT_VER3;
-                                break;
-                        case Opt_v3:
-                                nfs_data.flags |= NFS_MOUNT_VER3;
-                                break;
-                        case Opt_udp:
-                                nfs_data.flags &= ~NFS_MOUNT_TCP;
-                                break;
-                        case Opt_tcp:
-                                nfs_data.flags |= NFS_MOUNT_TCP;
-                                break;
-                        case Opt_acl:
-                                nfs_data.flags &= ~NFS_MOUNT_NOACL;
-                                break;
-                        case Opt_noacl:
-                                nfs_data.flags |= NFS_MOUNT_NOACL;
-                                break;
-                        default:
-                                printk(KERN_WARNING "Root-NFS: unknown "
-                                        "option: %s\n", p);
-                                return 0;
-                }
-        }
        return 1;
 }
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
 /*
- *  Prepare the NFS data structure and parse all options.
+ *  Parse NFS server and directory information passed on the kernel
+ *  command line.
+ *
+ *  nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ *  If there is a "%s" token in the <root-dir> string, it is replaced
+ *  by the ASCII-representation of the client's IP address.
+ *
+ *  Sets ROOT_DEV to Root_NFS, stores the resulting string in
+ *  nfs_root_parms and records the parsed server address in
+ *  root_server_addr.  Always returns 1.
 */
-static int __init root_nfs_name(char *name)
+static int __init nfs_root_setup(char *line)
 {
-        static char buf[NFS_MAXPATHLEN] __initdata;
+        ROOT_DEV = Root_NFS;
-        char *cp;
+        /*
+         * A value starting with '/', ',' or a digit is stored as given;
+         * anything else is substituted into the NFS_ROOT template.
+         */
+        if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
-        /* Set some default values */
+                strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
-        memset(&nfs_data, 0, sizeof(nfs_data));
+        } else {
-        nfs_port          = -1;
+                size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
-        nfs_data.version  = NFS_MOUNT_VERSION;
+                /* Clip line first so the sprintf() below stays inside nfs_root_parms */
+                if (n >= sizeof(nfs_root_parms))
-        nfs_data.flags    = NFS_MOUNT_NONLM;    /* No lockd in nfs root yet */
+                        line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
-        nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
+                sprintf(nfs_root_parms, NFS_ROOT, line);
-        nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-        nfs_data.acregmin = NFS_DEF_ACREGMIN;
-        nfs_data.acregmax = NFS_DEF_ACREGMAX;
-        nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
-        nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
-        strcpy(buf, NFS_ROOT);
-        /* Process options received from the remote server */
-        root_nfs_parse(root_server_path, buf);
-        /* Override them by options set on kernel command-line */
-        root_nfs_parse(name, buf);
-        cp = utsname()->nodename;
-        if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
-                printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
-                return -1;
        }
-        sprintf(nfs_export_path, buf, cp);
+        /*
+         * Extract the IP address of the NFS server containing our
+         * root file system, if one was specified.
+         *
+         * Note: root_nfs_parse_addr() removes the server-ip from
+         *       nfs_root_parms, if it exists.
+         */
+        root_server_addr = root_nfs_parse_addr(nfs_root_parms);
        return 1;
 }
+__setup("nfsroot=", nfs_root_setup);
-/*
+/*
+ * Copy @src into @dest, a buffer of @destlen bytes.
+ *
+ * Returns 0 if the whole string (including its terminating NUL) fit,
+ * or -1 if it had to be truncated.
+ */
+static int __init root_nfs_copy(char *dest, const char *src,
- *  Get NFS server address.
+                                     const size_t destlen)
- */
-static int __init root_nfs_addr(void)
 {
-        if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) {
+        /*
+         * strlcpy() returns strlen(src); a result equal to destlen leaves
+         * no room for the NUL, so that case is a truncation as well.
+         */
+        if (strlcpy(dest, src, destlen) >= destlen)
-                printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
                return -1;
-        }
+        return 0;
+}
-        snprintf(nfs_data.hostname, sizeof(nfs_data.hostname),
+/*
+ * Append @src to the string already in @dest, a buffer of @destlen bytes.
+ *
+ * Returns 0 if the combined string (including its terminating NUL) fit,
+ * or -1 if it had to be truncated.
+ */
+static int __init root_nfs_cat(char *dest, const char *src,
-                 "%pI4", &servaddr);
+                                  const size_t destlen)
+{
+        /*
+         * strlcat() returns the length of the string it tried to build; a
+         * result equal to destlen leaves no room for the NUL, so that case
+         * is a truncation as well.
+         */
+        if (strlcat(dest, src, destlen) >= destlen)
+                return -1;
        return 0;
 }
 /*
- *  Tell the user what's going on.
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
+ *
+ * The text before the first ',' is the export path; it is copied into
+ * @exppath (of size @exppathlen) unless it is empty or "default".  The
+ * remainder is appended to nfs_root_options, followed by a ',' when the
+ * buffer is non-empty and does not already end in one.  @incoming is
+ * modified by strsep().
+ *
+ * Returns 0 on success, or -1 if either string did not fit its buffer.
 */
-#ifdef NFSROOT_DEBUG
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
-static void __init root_nfs_print(void)
+                                         const size_t exppathlen)
 {
-        printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
+        char *p;
-                nfs_export_path, nfs_data.hostname);
-        printk(KERN_NOTICE "Root-NFS:     rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
-                nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
-        printk(KERN_NOTICE "Root-NFS:     acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
-                nfs_data.acregmin, nfs_data.acregmax,
-                nfs_data.acdirmin, nfs_data.acdirmax);
-        printk(KERN_NOTICE "Root-NFS:     nfsd port = %d, mountd port = %d, flags = %08x\n",
-                nfs_port, mount_port, nfs_data.flags);
-}
-#endif
-static int __init root_nfs_init(void)
+        /*
-{
+         * Set the NFS remote path
-#ifdef NFSROOT_DEBUG
+         */
-        nfs_debug |= NFSDBG_ROOT;
+        p = strsep(&incoming, ",");
-#endif
+        if (*p != '\0' && strcmp(p, "default") != 0)
+                if (root_nfs_copy(exppath, p, exppathlen))
+                        return -1;
        /*
-         * Decode the root directory path name and NFS options from
+         * @incoming now points to the rest of the string; if it
-         * the kernel command line. This has to go here in order to
+         * contains something, append it to our root options buffer
-         * be able to use the client IP address for the remote root
-         * directory (necessary for pure RARP booting).
         */
-        if (root_nfs_name(nfs_root_name) < 0 ||
+        if (incoming != NULL && *incoming != '\0')
-            root_nfs_addr() < 0)
+                if (root_nfs_cat(nfs_root_options, incoming,
-                return -1;
+                                                sizeof(nfs_root_options)))
+                        return -1;
-#ifdef NFSROOT_DEBUG
+        /*
-        root_nfs_print();
+         * Possibly prepare for more options to be appended.  Look at the
-#endif
+         * last character (index strlen - 1; the buffer is known to be
+         * non-empty here), not at the terminating NUL, so that a separator
+         * is only added when one is missing.
+         */
+        if (nfs_root_options[0] != '\0' &&
+            nfs_root_options[strlen(nfs_root_options) - 1] != ',')
+                if (root_nfs_cat(nfs_root_options, ",",
+                                                sizeof(nfs_root_options)))
+                        return -1;
        return 0;
 }
 /*
- *  Parse NFS server and directory information passed on the kernel
+ *  Decode the export directory path name and NFS options from
- *  command line.
+ *  the kernel command line.  This has to be done late in order to
+ *  use a dynamically acquired client IP address for the remote
+ *  root directory path.
+ *
+ *  Options from root_server_path are parsed first, then those from
+ *  @cmdline; "nolock,addr=<server>" is appended last.  The results are
+ *  left in nfs_root_options, nfs_export_path and nfs_root_device.
+ *
+ *  Returns zero if successful; otherwise -1 is returned.
 */
-static int __init nfs_root_setup(char *line)
+static int __init root_nfs_data(char *cmdline)
 {
-        ROOT_DEV = Root_NFS;
+        char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
-        if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+        int len, retval = -1;
-                strlcpy(nfs_root_name, line, sizeof(nfs_root_name));
+        char *tmp = NULL;
-        } else {
+        const size_t tmplen = sizeof(nfs_export_path);
-                int n = strlen(line) + sizeof(NFS_ROOT) - 1;
-                if (n >= sizeof(nfs_root_name))
+        tmp = kzalloc(tmplen, GFP_KERNEL);
-                        line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0';
+        if (tmp == NULL)
-                sprintf(nfs_root_name, NFS_ROOT, line);
+                goto out_nomem;
+        strcpy(tmp, NFS_ROOT);
+        if (root_server_path[0] != '\0') {
+                dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+                        root_server_path);
+                if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+                        goto out_optionstoolong;
        }
-        root_server_addr = root_nfs_parse_addr(nfs_root_name);
-        return 1;
-}
-__setup("nfsroot=", nfs_root_setup);
-/***************************************************************************
-               Routines to actually mount the root directory
+        if (cmdline[0] != '\0') {
+                dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+                if (root_nfs_parse_options(cmdline, tmp, tmplen))
+                        goto out_optionstoolong;
+        }
- ***************************************************************************/
+        /*
+         * Append mandatory options for nfsroot so they override
+         * what has come before
+         */
+        snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+                        &servaddr);
+        if (root_nfs_cat(nfs_root_options, addr_option,
+                                                sizeof(nfs_root_options)))
+                goto out_optionstoolong;
-/*
+        /*
- *  Construct sockaddr_in from address and port number.
+         * Set up nfs_root_device.  For NFS mounts, this looks like
- */
+         *
-static inline void
+         *      server:/path
-set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+         *
-{
+         * At this point, utsname()->nodename contains our local
-        sin->sin_family = AF_INET;
+         * IP address or hostname, set by ipconfig.  If "%s" exists
-        sin->sin_addr.s_addr = addr;
+         * in tmp, substitute the nodename, then shovel the whole
-        sin->sin_port = port;
+         * mess into nfs_root_device.
-}
+         *
+         * snprintf() returns the length the complete string would have
+         * had, so a result equal to the buffer size already means the
+         * output was cut short; hence ">=" in both checks below.
+         *
+         * NOTE(review): tmp is used as the format string and can carry
+         * text taken from root_server_path or the command line; any
+         * conversion other than a single "%s" would be interpreted by
+         * snprintf().  Left as is, flagged for review.
+         */
+        len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+                                tmp, utsname()->nodename);
+        if (len >= (int)sizeof(nfs_export_path))
+                goto out_devnametoolong;
+        len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+                                "%pI4:%s", &servaddr, nfs_export_path);
+        if (len >= (int)sizeof(nfs_root_device))
+                goto out_devnametoolong;
-/*
+        retval = 0;
- *  Query server portmapper for the port of a daemon program.
- */
-static int __init root_nfs_getport(int program, int version, int proto)
-{
-        struct sockaddr_in sin;
-        printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n",
+out:
-                program, version, &servaddr);
+        kfree(tmp);
-        set_sockaddr(&sin, servaddr, 0);
+        return retval;
-        return rpcb_getport_sync(&sin, program, version, proto);
+out_nomem:
+        printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+        goto out;
+out_optionstoolong:
+        printk(KERN_ERR "Root-NFS: mount options string too long\n");
+        goto out;
+out_devnametoolong:
+        printk(KERN_ERR "Root-NFS: root device name too long.\n");
+        goto out;
 }
+/**
-/*
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
- *  Use portmapper to find mountd and nfsd port numbers if not overriden
+ * @root_device: OUT: address of string containing NFSROOT device
- *  by the user. Use defaults if portmapper is not available.
+ * @root_data: OUT: address of string containing NFSROOT mount options
- *  XXX: Is there any nfs server with no portmapper?
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
+ *
+ * Fails when root_server_addr is INADDR_NONE or when root_nfs_data()
+ * reports an error.  On success the returned pointers refer to the
+ * file-scope nfs_root_device and nfs_root_options buffers.
 */
-static int __init root_nfs_ports(void)
+int __init nfs_root_data(char **root_device, char **root_data)
 {
-        int port;
+        /* Keep a copy in servaddr; root_nfs_data() formats it into the strings */
+        servaddr = root_server_addr;
-        int nfsd_ver, mountd_ver;
+        if (servaddr == htonl(INADDR_NONE)) {
-        int nfsd_port, mountd_port;
+                printk(KERN_ERR "Root-NFS: no NFS server address\n");
-        int proto;
+                return -1;
-        if (nfs_data.flags & NFS_MOUNT_VER3) {
-                nfsd_ver = NFS3_VERSION;
-                mountd_ver = NFS_MNT3_VERSION;
-                nfsd_port = NFS_PORT;
-                mountd_port = NFS_MNT_PORT;
-        } else {
-                nfsd_ver = NFS2_VERSION;
-                mountd_ver = NFS_MNT_VERSION;
-                nfsd_port = NFS_PORT;
-                mountd_port = NFS_MNT_PORT;
-        }
-        proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-        if (nfs_port < 0) {
-                if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
-                        printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
-                                        "number from server, using default\n");
-                        port = nfsd_port;
-                }
-                nfs_port = port;
-                dprintk("Root-NFS: Portmapper on server returned %d "
-                        "as nfsd port\n", port);
        }
-        if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
+        if (root_nfs_data(nfs_root_parms) < 0)
-                printk(KERN_ERR "Root-NFS: Unable to get mountd port "
+                return -1;
-                                "number from server, using default\n");
-                port = mountd_port;
-        }
-        mount_port = port;
-        dprintk("Root-NFS: mountd port is %d\n", port);
+        *root_device = nfs_root_device;
+        *root_data = nfs_root_options;
        return 0;
 }
-/*
- *  Get a file handle from the server for the directory which is to be
- *  mounted.
- */
-static int __init root_nfs_get_handle(void)
-{
-        struct sockaddr_in sin;
-        unsigned int auth_flav_len = 0;
-        struct nfs_mount_request request = {
-                .sap            = (struct sockaddr *)&sin,
-                .salen          = sizeof(sin),
-                .dirpath        = nfs_export_path,
-                .version        = (nfs_data.flags & NFS_MOUNT_VER3) ?
-                                        NFS_MNT3_VERSION : NFS_MNT_VERSION,
-                .protocol       = (nfs_data.flags & NFS_MOUNT_TCP) ?
-                                        XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-                .auth_flav_len  = &auth_flav_len,
-        };
-        int status = -ENOMEM;
-        request.fh = nfs_alloc_fhandle();
-        if (!request.fh)
-                goto out;
-        set_sockaddr(&sin, servaddr, htons(mount_port));
-        status = nfs_mount(&request);
-        if (status < 0)
-                printk(KERN_ERR "Root-NFS: Server returned error %d "
-                                "while mounting %s\n", status, nfs_export_path);
-        else {
-                nfs_data.root.size = request.fh->size;
-                memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
-        }
-        nfs_free_fhandle(request.fh);
-out:
-        return status;
-}
-/*
- *  Get the NFS port numbers and file handle, and return the prepared 'data'
- *  argument for mount() if everything went OK. Return NULL otherwise.
- */
-void * __init nfs_root_data(void)
-{
-        if (root_nfs_init() < 0
-         || root_nfs_ports() < 0
-         || root_nfs_get_handle() < 0)
-                return NULL;
-        set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
-        return (void*)&nfs_data;
-}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..137b549e63db 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
        if (req == NULL)
                return ERR_PTR(-ENOMEM);
+        /* get lock context early so we can deal with alloc failures */
+        req->wb_lock_context = nfs_get_lock_context(ctx);
+        if (req->wb_lock_context == NULL) {
+                nfs_page_free(req);
+                return ERR_PTR(-ENOMEM);
+        }
        /* Initialize the request struct. Initially, we assume a
         * long write-back delay. This will be adjusted in
         * update_nfs_request below if the region is not locked. */
@@ -79,7 +86,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
        req->wb_pgbase  = offset;
        req->wb_bytes   = count;
        req->wb_context = get_nfs_open_context(ctx);
-        req->wb_lock_context = nfs_get_lock_context(ctx);
        kref_init(&req->wb_kref);
        return req;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..db773428f95f
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "pnfs.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+/* Return the registered pnfs layout driver module matching given id */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+        struct pnfs_layoutdriver_type *local;
+        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+                if (local->id == id)
+                        goto out;
+        local = NULL;
+out:
+        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+        return local;
+}
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+        struct pnfs_layoutdriver_type *local;
+        spin_lock(&pnfs_spinlock);
+        local = find_pnfs_driver_locked(id);
+        spin_unlock(&pnfs_spinlock);
+        return local;
+}
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+        if (nfss->pnfs_curr_ld) {
+                nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+                module_put(nfss->pnfs_curr_ld->owner);
+        }
+        nfss->pnfs_curr_ld = NULL;
+}
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+{
+        struct pnfs_layoutdriver_type *ld_type = NULL;
+        if (id == 0)
+                goto out_no_driver;
+        if (!(server->nfs_client->cl_exchange_flags &
+                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+                printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
+                       id, server->nfs_client->cl_exchange_flags);
+                goto out_no_driver;
+        }
+        ld_type = find_pnfs_driver(id);
+        if (!ld_type) {
+                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+                ld_type = find_pnfs_driver(id);
+                if (!ld_type) {
+                        dprintk("%s: No pNFS module found for %u.\n",
+                                __func__, id);
+                        goto out_no_driver;
+                }
+        }
+        if (!try_module_get(ld_type->owner)) {
+                dprintk("%s: Could not grab reference on module\n", __func__);
+                goto out_no_driver;
+        }
+        server->pnfs_curr_ld = ld_type;
+        if (ld_type->set_layoutdriver(server)) {
+                printk(KERN_ERR
+                       "%s: Error initializing mount point for layout driver %u.\n",
+                       __func__, id);
+                module_put(ld_type->owner);
+                goto out_no_driver;
+        }
+        dprintk("%s: pNFS module for %u set\n", __func__, id);
+        return;
+out_no_driver:
+        dprintk("%s: Using NFSv4 I/O\n", __func__);
+        server->pnfs_curr_ld = NULL;
+}
+/* Add ld_type to pnfs_modules_tbl.  Returns 0 on success, or -EINVAL when the
+ * id is 0, an lseg op is missing, or that id is already registered. */
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+        int status = -EINVAL;
+        if (ld_type->id == 0) {
+                printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+                return status;
+        }
+        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+                printk(KERN_ERR "%s Layout driver must provide "
+                       "alloc_lseg and free_lseg.\n", __func__);
+                return status;
+        }
+        spin_lock(&pnfs_spinlock);
+        if (!find_pnfs_driver_locked(ld_type->id)) {
+                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+                status = 0;
+                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+                        ld_type->name);
+        } else {
+                printk(KERN_ERR "%s Module with id %u already loaded!\n",
+                        __func__, ld_type->id);
+        }
+        spin_unlock(&pnfs_spinlock);
+        return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+        spin_lock(&pnfs_spinlock);
+        list_del(&ld_type->pnfs_tblid);
+        spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+/*
+ * pNFS client layout cache
+ */
+static void
+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+        assert_spin_locked(&lo->inode->i_lock);
+        lo->refcount++;
+}
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+        assert_spin_locked(&lo->inode->i_lock);
+        BUG_ON(lo->refcount == 0);
+        lo->refcount--;
+        if (!lo->refcount) {
+                dprintk("%s: freeing layout cache %p\n", __func__, lo);
+                BUG_ON(!list_empty(&lo->layouts));
+                NFS_I(lo->inode)->layout = NULL;
+                kfree(lo);
+        }
+}
+void
+put_layout_hdr(struct inode *inode)
+{
+        spin_lock(&inode->i_lock);
+        put_layout_hdr_locked(NFS_I(inode)->layout);
+        spin_unlock(&inode->i_lock);
+}
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+        INIT_LIST_HEAD(&lseg->fi_list);
+        kref_init(&lseg->kref);
+        lseg->layout = lo;
+}
+/* Called without i_lock held, as the free_lseg call may sleep */
+static void
+destroy_lseg(struct kref *kref)
+{
+        struct pnfs_layout_segment *lseg =
+                container_of(kref, struct pnfs_layout_segment, kref);
+        struct inode *ino = lseg->layout->inode;
+        dprintk("--> %s\n", __func__);
+        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+        /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+        put_layout_hdr(ino);
+}
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
+{
+        if (!lseg)
+                return;
+        dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+                atomic_read(&lseg->kref.refcount));
+        kref_put(&lseg->kref, destroy_lseg);
+}
+static void
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+{
+        struct pnfs_layout_segment *lseg, *next;
+        struct nfs_client *clp;
+        dprintk("%s:Begin lo %p\n", __func__, lo);
+        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+                dprintk("%s: freeing lseg %p\n", __func__, lseg);
+                list_move(&lseg->fi_list, tmp_list);
+        }
+        clp = NFS_SERVER(lo->inode)->nfs_client;
+        spin_lock(&clp->cl_lock);
+        /* List does not take a reference, so no need for put here */
+        list_del_init(&lo->layouts);
+        spin_unlock(&clp->cl_lock);
+        write_seqlock(&lo->seqlock);
+        clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+        write_sequnlock(&lo->seqlock);
+        dprintk("%s:Return\n", __func__);
+}
+static void
+pnfs_free_lseg_list(struct list_head *tmp_list)
+{
+        struct pnfs_layout_segment *lseg;
+        while (!list_empty(tmp_list)) {
+                lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+                                fi_list);
+                dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+                list_del(&lseg->fi_list);
+                put_lseg(lseg);
+        }
+}
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+        struct pnfs_layout_hdr *lo;
+        LIST_HEAD(tmp_list);
+        spin_lock(&nfsi->vfs_inode.i_lock);
+        lo = nfsi->layout;
+        if (lo) {
+                pnfs_clear_lseg_list(lo, &tmp_list);
+                /* Matched by refcount set to 1 in alloc_init_layout_hdr */
+                put_layout_hdr_locked(lo);
+        }
+        spin_unlock(&nfsi->vfs_inode.i_lock);
+        pnfs_free_lseg_list(&tmp_list);
+}
+/*
+ * Called by the state manager to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+        struct pnfs_layout_hdr *lo;
+        LIST_HEAD(tmp_list);
+        spin_lock(&clp->cl_lock);
+        list_splice_init(&clp->cl_layouts, &tmp_list);
+        spin_unlock(&clp->cl_lock);
+        while (!list_empty(&tmp_list)) {
+                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+                                layouts);
+                dprintk("%s freeing layout for inode %lu\n", __func__,
+                        lo->inode->i_ino);
+                pnfs_destroy_layout(NFS_I(lo->inode));
+        }
+}
+/* Update lo->stateid with new if it is more recent.
+ *
+ * lo->stateid could be the open stateid, in which case we use what is given.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+                        const nfs4_stateid *new)
+{
+        nfs4_stateid *old = &lo->stateid;
+        bool overwrite = false;
+        write_seqlock(&lo->seqlock);
+        if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+            memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+                overwrite = true;
+        else {
+                u32 oldseq, newseq;
+                oldseq = be32_to_cpu(old->stateid.seqid);
+                newseq = be32_to_cpu(new->stateid.seqid);
+                if ((int)(newseq - oldseq) > 0)
+                        overwrite = true;
+        }
+        if (overwrite)
+                memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+        write_sequnlock(&lo->seqlock);
+}
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+                              struct nfs4_state *state)
+{
+        int seq;
+        dprintk("--> %s\n", __func__);
+        write_seqlock(&lo->seqlock);
+        do {
+                seq = read_seqbegin(&state->seqlock);
+                memcpy(lo->stateid.data, state->stateid.data,
+                       sizeof(state->stateid.data));
+        } while (read_seqretry(&state->seqlock, seq));
+        set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+        write_sequnlock(&lo->seqlock);
+        dprintk("<-- %s\n", __func__);
+}
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                        struct nfs4_state *open_state)
+{
+        int seq;
+        dprintk("--> %s\n", __func__);
+        do {
+                seq = read_seqbegin(&lo->seqlock);
+                if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+                        /* This will trigger retry of the read */
+                        pnfs_layout_from_open_stateid(lo, open_state);
+                } else
+                        memcpy(dst->data, lo->stateid.data,
+                               sizeof(lo->stateid.data));
+        } while (read_seqretry(&lo->seqlock, seq));
+        dprintk("<-- %s\n", __func__);
+}
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+           struct nfs_open_context *ctx,
+           u32 iomode)
+{
+        struct inode *ino = lo->inode;
+        struct nfs_server *server = NFS_SERVER(ino);
+        struct nfs4_layoutget *lgp;
+        struct pnfs_layout_segment *lseg = NULL;
+        dprintk("--> %s\n", __func__);
+        BUG_ON(ctx == NULL);
+        lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+        if (lgp == NULL) {
+                put_layout_hdr(lo->inode);
+                return NULL;
+        }
+        lgp->args.minlength = NFS4_MAX_UINT64;
+        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+        lgp->args.range.iomode = iomode;
+        lgp->args.range.offset = 0;
+        lgp->args.range.length = NFS4_MAX_UINT64;
+        lgp->args.type = server->pnfs_curr_ld->id;
+        lgp->args.inode = ino;
+        lgp->args.ctx = get_nfs_open_context(ctx);
+        lgp->lsegpp = &lseg;
+        /* Synchronously retrieve layout information from server and
+         * store in lseg.
+         */
+        nfs4_proc_layoutget(lgp);
+        if (!lseg) {
+                /* remember that LAYOUTGET failed and suspend trying */
+                set_bit(lo_fail_bit(iomode), &lo->state);
+        }
+        return lseg;
+}
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+        /* read > read/write */
+        return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+}
+static void
+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+                   struct pnfs_layout_segment *lseg)
+{
+        struct pnfs_layout_segment *lp;
+        int found = 0;
+        dprintk("%s:Begin\n", __func__);
+        assert_spin_locked(&lo->inode->i_lock);
+        if (list_empty(&lo->segs)) {
+                struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+                spin_lock(&clp->cl_lock);
+                BUG_ON(!list_empty(&lo->layouts));
+                list_add_tail(&lo->layouts, &clp->cl_layouts);
+                spin_unlock(&clp->cl_lock);
+        }
+        list_for_each_entry(lp, &lo->segs, fi_list) {
+                if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+                        continue;
+                list_add_tail(&lseg->fi_list, &lp->fi_list);
+                dprintk("%s: inserted lseg %p "
+                        "iomode %d offset %llu length %llu before "
+                        "lp %p iomode %d offset %llu length %llu\n",
+                        __func__, lseg, lseg->range.iomode,
+                        lseg->range.offset, lseg->range.length,
+                        lp, lp->range.iomode, lp->range.offset,
+                        lp->range.length);
+                found = 1;
+                break;
+        }
+        if (!found) {
+                list_add_tail(&lseg->fi_list, &lo->segs);
+                dprintk("%s: inserted lseg %p "
+                        "iomode %d offset %llu length %llu at tail\n",
+                        __func__, lseg, lseg->range.iomode,
+                        lseg->range.offset, lseg->range.length);
+        }
+        get_layout_hdr_locked(lo);
+        dprintk("%s:Return\n", __func__);
+}
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+        if (!lo)
+                return NULL;
+        lo->refcount = 1;
+        INIT_LIST_HEAD(&lo->layouts);
+        INIT_LIST_HEAD(&lo->segs);
+        seqlock_init(&lo->seqlock);
+        lo->inode = ino;
+        return lo;
+}
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_hdr *new = NULL;
+        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+        assert_spin_locked(&ino->i_lock);
+        if (nfsi->layout)
+                return nfsi->layout;
+        spin_unlock(&ino->i_lock);
+        new = alloc_init_layout_hdr(ino);
+        spin_lock(&ino->i_lock);
+        if (likely(nfsi->layout == NULL))       /* Won the race? */
+                nfsi->layout = new;
+        else
+                kfree(new);
+        return nfsi->layout;
+}
+/*
+ * iomode matching rules:
+ * iomode       lseg    match
+ * -----        -----   -----
+ * ANY          READ    true
+ * ANY          RW      true
+ * RW           READ    false
+ * RW           RW      true
+ * READ         READ    true
+ * READ         RW      true
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+        return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+}
+/*
+ * lookup range in layout
+ */
+static struct pnfs_layout_segment *
+pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+        struct pnfs_layout_segment *lseg, *ret = NULL;
+        dprintk("%s:Begin\n", __func__);
+        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry(lseg, &lo->segs, fi_list) {
+                if (is_matching_lseg(lseg, iomode)) {
+                        ret = lseg;
+                        break;
+                }
+                if (cmp_layout(iomode, lseg->range.iomode) > 0)
+                        break;
+        }
+        dprintk("%s:Return lseg %p ref %d\n",
+                __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+        return ret;
+}
+/*
+ * Return the cached layout segment for iomode, fetching it from the server
+ * if none is cached.  NULL when pNFS is off or no segment can be obtained.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+                   struct nfs_open_context *ctx,
+                   enum pnfs_iomode iomode)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_hdr *lo;
+        struct pnfs_layout_segment *lseg = NULL;
+        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+                return NULL;
+        spin_lock(&ino->i_lock);
+        lo = pnfs_find_alloc_layout(ino);
+        if (lo == NULL) {
+                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
+                goto out_unlock;
+        }
+        /* Check to see if the layout for the given range already exists */
+        lseg = pnfs_has_layout(lo, iomode);
+        if (lseg) {
+                dprintk("%s: Using cached lseg %p for iomode %d)\n",
+                        __func__, lseg, iomode);
+                goto out_unlock;
+        }
+        /* if LAYOUTGET already failed once we don't try again */
+        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+                goto out_unlock;
+        get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+        spin_unlock(&ino->i_lock);
+        lseg = send_layoutget(lo, ctx, iomode);
+out:
+        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+                nfsi->layout ? nfsi->layout->state : 0, lseg); /* may be NULL */
+        return lseg;
+out_unlock:
+        spin_unlock(&ino->i_lock);
+        goto out;
+}
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+        struct nfs4_layoutget_res *res = &lgp->res;
+        struct pnfs_layout_segment *lseg;
+        struct inode *ino = lo->inode;
+        int status = 0;
+        /* Inject layout blob into I/O device driver */
+        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+        if (!lseg || IS_ERR(lseg)) {
+                if (!lseg)
+                        status = -ENOMEM;
+                else
+                        status = PTR_ERR(lseg);
+                dprintk("%s: Could not allocate layout: error %d\n",
+                       __func__, status);
+                goto out;
+        }
+        spin_lock(&ino->i_lock);
+        init_lseg(lo, lseg);
+        lseg->range = res->range;
+        *lgp->lsegpp = lseg;
+        pnfs_insert_layout(lo, lseg);
+        /* Done processing layoutget. Set the layout stateid */
+        pnfs_set_layout_stateid(lo, &res->stateid);
+        spin_unlock(&ino->i_lock);
+out:
+        return status;
+}
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+                         void (*free_callback)(struct pnfs_deviceid_node *))
+{
+        struct pnfs_deviceid_cache *c;
+        c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+        if (!c)
+                return -ENOMEM;
+        spin_lock(&clp->cl_lock);
+        if (clp->cl_devid_cache != NULL) {
+                atomic_inc(&clp->cl_devid_cache->dc_ref);
+                dprintk("%s [kref [%d]]\n", __func__,
+                        atomic_read(&clp->cl_devid_cache->dc_ref));
+                kfree(c);
+        } else {
+                /* kzalloc initializes hlists */
+                spin_lock_init(&c->dc_lock);
+                atomic_set(&c->dc_ref, 1);
+                c->dc_free_callback = free_callback;
+                clp->cl_devid_cache = c;
+                dprintk("%s [new]\n", __func__);
+        }
+        spin_unlock(&clp->cl_lock);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                  struct pnfs_deviceid_node *devid)
+{
+        struct nfs4_deviceid *id = &devid->de_id;
+        struct pnfs_deviceid_node *d;
+        struct hlist_node *n;
+        long h = nfs4_deviceid_hash(id);
+        dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+        if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+                return;
+        hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+                if (!memcmp(&d->de_id, id, sizeof(*id))) {
+                        hlist_del_rcu(&d->de_node);
+                        spin_unlock(&c->dc_lock);
+                        synchronize_rcu();
+                        c->dc_free_callback(devid);
+                        return;
+                }
+        spin_unlock(&c->dc_lock);
+        /* Why wasn't it found in  the list? */
+        BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+/* Find and reference a deviceid */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+        struct pnfs_deviceid_node *d;
+        struct hlist_node *n;
+        long hash = nfs4_deviceid_hash(id);
+        dprintk("--> %s hash %ld\n", __func__, hash);
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+                if (!memcmp(&d->de_id, id, sizeof(*id))) {
+                        if (!atomic_inc_not_zero(&d->de_ref)) {
+                                goto fail;
+                        } else {
+                                rcu_read_unlock();
+                                return d;
+                        }
+                }
+        }
+fail:
+        rcu_read_unlock();
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+        struct pnfs_deviceid_node *d;
+        long hash = nfs4_deviceid_hash(&new->de_id);
+        dprintk("--> %s hash %ld\n", __func__, hash);
+        spin_lock(&c->dc_lock);
+        d = pnfs_find_get_deviceid(c, &new->de_id);
+        if (d) {
+                spin_unlock(&c->dc_lock);
+                dprintk("%s [discard]\n", __func__);
+                c->dc_free_callback(new);
+                return d;
+        }
+        INIT_HLIST_NODE(&new->de_node);
+        atomic_set(&new->de_ref, 1);
+        hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+        spin_unlock(&c->dc_lock);
+        dprintk("%s [new]\n", __func__);
+        return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+        struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+        dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+        if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+                int i;
+                /* Verify cache is empty */
+                for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+                        BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+                clp->cl_devid_cache = NULL;
+                spin_unlock(&clp->cl_lock);
+                kfree(local);
+        }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e12367d50489
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+struct pnfs_layout_segment {
+        struct list_head fi_list;
+        struct pnfs_layout_range range;
+        struct kref kref;
+        struct pnfs_layout_hdr *layout;
+};
+#ifdef CONFIG_NFS_V4_1
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+enum {
+        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
+        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
+        NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
+};
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+        struct list_head pnfs_tblid;
+        const u32 id;
+        const char *name;
+        struct module *owner;
+        int (*set_layoutdriver) (struct nfs_server *);
+        int (*clear_layoutdriver) (struct nfs_server *);
+        struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+        void (*free_lseg) (struct pnfs_layout_segment *lseg);
+};
+struct pnfs_layout_hdr {
+        unsigned long           refcount;
+        struct list_head        layouts;   /* other client layouts */
+        struct list_head        segs;      /* layout segments list */
+        seqlock_t               seqlock;   /* Protects the stateid */
+        nfs4_stateid            stateid;
+        unsigned long           state;
+        struct inode            *inode;
+};
+struct pnfs_device {
+        struct nfs4_deviceid dev_id;
+        unsigned int  layout_type;
+        unsigned int  mincount;
+        struct page **pages;
+        void          *area;
+        unsigned int  pgbase;
+        unsigned int  pglen;
+};
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS        5
+#define NFS4_DEVICE_ID_HASH_SIZE        (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK        (NFS4_DEVICE_ID_HASH_SIZE - 1)
+static inline u32
+nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
+        unsigned char *cptr = (unsigned char *)id->data;
+        unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+        u32 x = 0;
+        while (nbytes--) {
+                x *= 37;
+                x += *cptr++;
+        }
+        return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+struct pnfs_deviceid_node {
+        struct hlist_node       de_node;
+        struct nfs4_deviceid    de_id;
+        atomic_t                de_ref;
+};
+struct pnfs_deviceid_cache {
+        spinlock_t              dc_lock;
+        atomic_t                dc_ref;
+        void                    (*dc_free_callback)(struct pnfs_deviceid_node *);
+        struct hlist_head       dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
+                        void (*free_callback)(struct pnfs_deviceid_node *));
+extern void pnfs_put_deviceid_cache(struct nfs_client *);
+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
+                                struct pnfs_deviceid_cache *,
+                                struct nfs4_deviceid *);
+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
+                                struct pnfs_deviceid_cache *,
+                                struct pnfs_deviceid_node *);
+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                              struct pnfs_deviceid_node *devid);
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+                                   struct pnfs_device *dev);
+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+/* pnfs.c */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                   enum pnfs_iomode access_type);
+void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+void put_layout_hdr(struct inode *inode);
+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                             struct nfs4_state *open_state);
+static inline int lo_fail_bit(u32 iomode)
+{
+        return iomode == IOMODE_RW ?
+                         NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+/* Return true if a layout driver is being used for this mountpoint */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+        return nfss->pnfs_curr_ld != NULL;
+}
+#else  /* CONFIG_NFS_V4_1 */
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+static inline struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                   enum pnfs_iomode access_type)
+{
+        return NULL;
+}
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+{
+}
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..58e7f84fc1fd 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                int flags, struct nameidata *nd)
+                int flags, struct nfs_open_context *ctx)
 {
        struct nfs_createdata *data;
        struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+/*
+ * Point @msg at the NFSPROC_RENAME entry of nfs_procedures.
+ * @dir is accepted to match the rename_setup callback signature but is
+ * not used by this implementation.
+ */
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+        msg->rpc_proc = nfs_procedures + NFSPROC_RENAME;
+}
+/*
+ * Completion hook for a rename RPC.
+ * Returns 0 when nfs_async_handle_expired_key() reports that it handled the
+ * task (the directories are then left untouched); otherwise marks both
+ * directories for revalidation and returns 1.
+ */
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                     struct inode *new_dir)
+{
+        int key_expired = nfs_async_handle_expired_key(task);
+        if (!key_expired) {
+                nfs_mark_for_revalidate(old_dir);
+                nfs_mark_for_revalidate(new_dir);
+        }
+        return key_expired ? 0 : 1;
+}
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
                struct inode *new_dir, struct qstr *new_name)
 {
        struct nfs_renameargs   arg = {
-                .fromfh         = NFS_FH(old_dir),
+                .old_dir        = NFS_FH(old_dir),
-                .fromname       = old_name->name,
+                .old_name       = old_name,
-                .fromlen        = old_name->len,
+                .new_dir        = NFS_FH(new_dir),
-                .tofh           = NFS_FH(new_dir),
+                .new_name       = new_name,
-                .toname         = new_name->name,
-                .tolen          = new_name->len
        };
        struct rpc_message msg = {
                .rpc_proc       = &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
 */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                 u64 cookie, struct page *page, unsigned int count, int plus)
+                 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        struct nfs_readdirargs  arg = {
                .fh             = NFS_FH(dir),
                .cookie         = cookie,
                .count          = count,
-                .pages          = &page,
+                .pages          = pages,
        };
        struct rpc_message      msg = {
                .rpc_proc       = &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .unlink_setup   = nfs_proc_unlink_setup,
        .unlink_done    = nfs_proc_unlink_done,
        .rename         = nfs_proc_rename,
+        .rename_setup   = nfs_proc_rename_setup,
+        .rename_done    = nfs_proc_rename_done,
        .link           = nfs_proc_link,
        .symlink        = nfs_proc_symlink,
        .mkdir          = nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..e4b62c6f5a6e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_PAGECACHE
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
        len = nfs_page_length(page);
        if (len == 0)
                return nfs_return_empty_page(page);
+        pnfs_update_layout(inode, ctx, IOMODE_READ);
        new = nfs_create_request(ctx, inode, page, 0, len);
        if (IS_ERR(new)) {
                unlock_page(page);
@@ -625,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        if (ret == 0)
                goto read_complete; /* all pages were read */
+        pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..0a42e8f4adcb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
        Opt_lookupcache,
        Opt_fscache_uniq,
+        Opt_local_lock,
        /* Special mount options */
        Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
        { Opt_lookupcache, "lookupcache=%s" },
        { Opt_fscache_uniq, "fsc=%s" },
+        { Opt_local_lock, "local_lock=%s" },
        { Opt_err, NULL }
 };
@@ -236,14 +238,30 @@ static match_table_t nfs_lookupcache_tokens = {
        { Opt_lookupcache_err, NULL }
 };
+/* Token ids for the value given to the "local_lock=" mount option. */
+enum {
+        Opt_local_lock_all,
+        Opt_local_lock_flock,
+        Opt_local_lock_posix,
+        Opt_local_lock_none,
+        Opt_local_lock_err
+};
+/*
+ * Accepted values for "local_lock=".  The table is only ever read (it is
+ * handed to match_token()), so declare it const like nfs_mount_option_tokens.
+ * The NULL pattern entry terminates the table and yields Opt_local_lock_err.
+ */
+static const match_table_t nfs_local_lock_tokens = {
+        { Opt_local_lock_all, "all" },
+        { Opt_local_lock_flock, "flock" },
+        { Opt_local_lock_posix, "posix" },
+        { Opt_local_lock_none, "none" },
+        { Opt_local_lock_err, NULL }
+};
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
-                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+                int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs",
-        .get_sb         = nfs_xdev_get_sb,
+        .mount          = nfs_xdev_mount,
        .kill_sb        = nfs_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -284,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
        struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_remote_get_sb,
+        .mount          = nfs4_remote_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_xdev_get_sb,
+        .mount          = nfs4_xdev_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_remote_referral_get_sb,
+        .mount          = nfs4_remote_referral_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -622,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        const struct proc_nfs_info *nfs_infop;
        struct nfs_client *clp = nfss->nfs_client;
        u32 version = clp->rpc_ops->version;
+        int local_flock, local_fcntl;
        seq_printf(m, ",vers=%u", version);
        seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +689,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                else
                        seq_printf(m, ",lookupcache=pos");
        }
+        local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+        local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+        if (!local_flock && !local_fcntl)
+                seq_printf(m, ",local_lock=none");
+        else if (local_flock && local_fcntl)
+                seq_printf(m, ",local_lock=all");
+        else if (local_flock)
+                seq_printf(m, ",local_lock=flock");
+        else
+                seq_printf(m, ",local_lock=posix");
 }
 /*
@@ -1017,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
                        break;
                case Opt_lock:
                        mnt->flags &= ~NFS_MOUNT_NONLM;
+                        mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+                                        NFS_MOUNT_LOCAL_FCNTL);
                        break;
                case Opt_nolock:
                        mnt->flags |= NFS_MOUNT_NONLM;
+                        mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+                                       NFS_MOUNT_LOCAL_FCNTL);
                        break;
                case Opt_v2:
                        mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1420,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->fscache_uniq = string;
                        mnt->options |= NFS_OPTION_FSCACHE;
                        break;
+                case Opt_local_lock:
+                        string = match_strdup(args);
+                        if (string == NULL)
+                                goto out_nomem;
+                        token = match_token(string, nfs_local_lock_tokens,
+                                        args);
+                        kfree(string);
+                        switch (token) {
+                        case Opt_local_lock_all:
+                                mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+                                               NFS_MOUNT_LOCAL_FCNTL);
+                                break;
+                        case Opt_local_lock_flock:
+                                mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+                                break;
+                        case Opt_local_lock_posix:
+                                mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+                                break;
+                        case Opt_local_lock_none:
+                                mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+                                                NFS_MOUNT_LOCAL_FCNTL);
+                                break;
+                        default:
+                                dfprintk(MOUNT, "NFS:   invalid "
+                                                "local_lock argument\n");
+                                return 0;
+                        };
+                        break;
                /*
                 * Special options
@@ -1825,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
                if (!args->nfs_server.hostname)
                        goto out_nomem;
+                if (!(data->flags & NFS_MOUNT_NONLM))
+                        args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+                                         NFS_MOUNT_LOCAL_FCNTL);
+                else
+                        args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+                                        NFS_MOUNT_LOCAL_FCNTL);
                /*
                 * The legacy version 6 binary mount data from userspace has a
                 * field used only to transport selinux information into the
@@ -2328,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
 * Clone an NFS2/3 server record on xdev traversal (FSID-change)
 */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *
-                           const char *dev_name, void *raw_data,
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
-                           struct vfsmount *mnt)
+                const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -2342,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
        };
        int error;
-        dprintk("--> nfs_xdev_get_sb()\n");
+        dprintk("--> nfs_xdev_mount()\n");
        /* create a new volume representation */
        server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2389,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        /* clone any lsm security options from the parent to the new sb */
        security_sb_clone_mnt_opts(data->sb, s);
-        dprintk("<-- nfs_xdev_get_sb() = 0\n");
+        dprintk("<-- nfs_xdev_mount() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
 out_err_noserver:
-        dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
+        dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
                bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
        deactivate_locked_super(s);
-        dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
+        dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
 #ifdef CONFIG_NFS_V4
@@ -2441,7 +2508,8 @@ static void nfs4_fill_super(struct super_block *sb)
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-        args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+        args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+                         NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 static int nfs4_validate_text_mount_data(void *options,
@@ -2579,8 +2647,9 @@ out_no_address:
 /*
 * Get the superblock for the NFS4 root partition
 */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+static struct dentry *
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+                  const char *dev_name, void *raw_data)
 {
        struct nfs_parsed_mount_data *data = raw_data;
        struct super_block *s;
@@ -2644,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
                goto error_splat_root;
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
+        security_free_mnt_opts(&data->lsm_opts);
-        error = 0;
+        nfs_free_fhandle(mntfh);
+        return mntroot;
 out:
        security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
        nfs_free_fhandle(mntfh);
-        return error;
+        return ERR_PTR(error);
 out_free:
        nfs_free_server(server);
@@ -2898,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
 * Clone an NFS4 server record on xdev traversal (FSID-change)
 */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *
-                            const char *dev_name, void *raw_data,
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
-                            struct vfsmount *mnt)
+                 const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -2912,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
        };
        int error;
-        dprintk("--> nfs4_xdev_get_sb()\n");
+        dprintk("--> nfs4_xdev_mount()\n");
        /* create a new volume representation */
        server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2959,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        security_sb_clone_mnt_opts(data->sb, s);
-        dprintk("<-- nfs4_xdev_get_sb() = 0\n");
+        dprintk("<-- nfs4_xdev_mount() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
 out_err_noserver:
-        dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
+        dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
                bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
        deactivate_locked_super(s);
-        dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
+        dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+static struct dentry *
-                int flags, const char *dev_name, void *raw_data,
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
-                struct vfsmount *mnt)
+                           const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -3048,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        security_sb_clone_mnt_opts(data->sb, s);
        nfs_free_fhandle(mntfh);
        dprintk("<-- nfs4_referral_get_sb() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
@@ -3063,7 +3129,7 @@ out_err_noserver:
        nfs_free_fhandle(mntfh);
 out_err_nofh:
        dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
@@ -3072,7 +3138,7 @@ error_splat_bdi:
        deactivate_locked_super(s);
        nfs_free_fhandle(mntfh);
        dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
 /*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
                .extra1 = (int *)&nfs_set_port_min,
                .extra2 = (int *)&nfs_set_port_max,
        },
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
        {
                .procname = "idmap_cache_timeout",
                .data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
        {
                .procname       = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..7bdec8531400 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
 struct nfs_unlinkdata {
        struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
 * @dir: parent directory of dentry
 * @dentry: dentry to unlink
 */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
                status = PTR_ERR(data->cred);
                goto out_free;
        }
-        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        data->res.dir_attr = &data->dir_attr;
        status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
        if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
                nfs_free_unlinkdata(data);
 }
+/*
+ * Cancel a queued async unlink. Called when a sillyrename run fails.
+ * If the dentry carries DCACHE_NFSFS_RENAMED, the flag is cleared under
+ * d_lock and the nfs_unlinkdata found in d_fsdata is freed once the lock
+ * has been dropped.  Otherwise nothing is done.
+ */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+        struct nfs_unlinkdata *unlinkdata = NULL;
+        int was_renamed;
+        spin_lock(&dentry->d_lock);
+        was_renamed = (dentry->d_flags & DCACHE_NFSFS_RENAMED) != 0;
+        if (was_renamed) {
+                unlinkdata = dentry->d_fsdata;
+                dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+        }
+        spin_unlock(&dentry->d_lock);
+        /* the free happens outside d_lock */
+        if (was_renamed)
+                nfs_free_unlinkdata(unlinkdata);
+}
+struct nfs_renamedata {        /* per-call state of an async rename, allocated in nfs_async_rename() */
+        struct nfs_renameargs   args;           /* RPC arguments (msg.rpc_argp) */
+        struct nfs_renameres    res;            /* RPC results (msg.rpc_resp) */
+        struct rpc_cred         *cred;          /* from rpc_lookup_cred(); put in the release callback */
+        struct inode            *old_dir;       /* ihold()ed source directory */
+        struct dentry           *old_dentry;    /* dget()ed dentry being renamed */
+        struct nfs_fattr        old_fattr;      /* storage that res.old_fattr points at */
+        struct inode            *new_dir;       /* ihold()ed target directory */
+        struct dentry           *new_dentry;    /* dget()ed dentry for the new name */
+        struct nfs_fattr        new_fattr;      /* storage that res.new_fattr points at */
+};
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Lets the protocol's rename_done hook look at the result first; if the
+ * hook returns 0 the RPC is restarted.  On an RPC error the queued async
+ * unlink is cancelled.  On success the old dentry's verifier is refreshed
+ * and the dentry is d_move()d onto the new one.
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs_renamedata *rd = calldata;
+        struct inode *src_dir = rd->old_dir;
+        struct inode *dst_dir = rd->new_dir;
+        int finished = NFS_PROTO(src_dir)->rename_done(task, src_dir, dst_dir);
+        if (!finished) {
+                nfs_restart_rpc(task, NFS_SERVER(src_dir)->nfs_client);
+                return;
+        }
+        if (task->tk_status == 0) {
+                nfs_set_verifier(rd->old_dentry,
+                                 nfs_save_change_attribute(src_dir));
+                d_move(rd->old_dentry, rd->new_dentry);
+        } else {
+                nfs_cancel_async_unlink(rd->old_dentry);
+        }
+}
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ *
+ * Drops everything nfs_async_rename() took on behalf of the call: both
+ * dentry references, both inode references, the superblock activation,
+ * the credential, and finally the nfs_renamedata itself.  If the old
+ * dentry has an inode, that inode is marked for revalidation first.
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+        struct nfs_renamedata   *data = calldata;
+        struct super_block *sb = data->old_dir->i_sb;   /* read before old_dir is iput() below */
+        if (data->old_dentry->d_inode)
+                nfs_mark_for_revalidate(data->old_dentry->d_inode);
+        dput(data->old_dentry);
+        dput(data->new_dentry);
+        iput(data->old_dir);
+        iput(data->new_dir);
+        nfs_sb_deactive(sb);    /* pairs with nfs_sb_active() in nfs_async_rename() */
+        put_rpccred(data->cred);
+        kfree(data);
+}
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * rpc_call_prepare hook for the async rename: set up the NFSv4.1 sequence
+ * for this call and start the RPC only if nfs4_setup_sequence() returns 0.
+ */
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs_renamedata *rd = calldata;
+        struct nfs_server *srv = NFS_SERVER(rd->old_dir);
+        int rc = nfs4_setup_sequence(srv, &rd->args.seq_args,
+                                     &rd->res.seq_res, 1, task);
+        if (rc == 0)
+                rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+static const struct rpc_call_ops nfs_rename_ops = {    /* callbacks used by nfs_async_rename() */
+        .rpc_call_done = nfs_async_rename_done,
+        .rpc_release = nfs_async_rename_release,
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_rename_prepare,        /* only built with CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4_1 */
+};
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ *
+ * Returns the result of rpc_run_task(), or an ERR_PTR if the call data
+ * cannot be allocated or the credential lookup fails.  The references
+ * taken here are dropped by nfs_async_rename_release().
+ */
+static struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+                 struct dentry *old_dentry, struct dentry *new_dentry)
+{
+        struct nfs_renamedata *data;
+        struct rpc_message msg = { };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_message = &msg,
+                .callback_ops = &nfs_rename_ops,
+                .workqueue = nfsiod_workqueue,
+                .rpc_client = NFS_CLIENT(old_dir),
+                .flags = RPC_TASK_ASYNC,
+        };
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
+        if (data == NULL)
+                return ERR_PTR(-ENOMEM);
+        /* was terminated with ',' which silently chained the next statement */
+        task_setup_data.callback_data = data;
+        data->cred = rpc_lookup_cred();
+        if (IS_ERR(data->cred)) {
+                /* nothing else has been taken yet, so only the data is freed */
+                struct rpc_task *task = ERR_CAST(data->cred);
+                kfree(data);
+                return task;
+        }
+        msg.rpc_argp = &data->args;
+        msg.rpc_resp = &data->res;
+        msg.rpc_cred = data->cred;
+        /* set up nfs_renamedata */
+        data->old_dir = old_dir;
+        ihold(old_dir);
+        data->new_dir = new_dir;
+        ihold(new_dir);
+        data->old_dentry = dget(old_dentry);
+        data->new_dentry = dget(new_dentry);
+        nfs_fattr_init(&data->old_fattr);
+        nfs_fattr_init(&data->new_fattr);
+        /* set up nfs_renameargs */
+        data->args.old_dir = NFS_FH(old_dir);
+        data->args.old_name = &old_dentry->d_name;
+        data->args.new_dir = NFS_FH(new_dir);
+        data->args.new_name = &new_dentry->d_name;
+        /* set up nfs_renameres */
+        data->res.old_fattr = &data->old_fattr;
+        data->res.new_fattr = &data->new_fattr;
+        /* paired with nfs_sb_deactive() in nfs_async_rename_release() */
+        nfs_sb_active(old_dir->i_sb);
+        NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+        return rpc_run_task(&task_setup_data);
+}
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ *
+ * The new name is ".nfs" followed by the file id in hex and a hex counter
+ * that is bumped until lookup_one_len() yields a negative dentry.
+ *
+ * Returns -EBUSY if the dentry is already silly-renamed, if the lookup of
+ * a candidate name fails, or if the rename task cannot be started; the
+ * error from nfs_async_unlink() if queueing the unlink fails; otherwise
+ * the result of waiting for the rename task, or its tk_status.
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+        static unsigned int sillycounter;
+        const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;     /* two hex digits per byte */
+        const int      countersize = sizeof(sillycounter)*2;                    /* two hex digits per byte */
+        const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;   /* -1: sizeof counts the NUL */
+        char           silly[slen+1];
+        struct dentry *sdentry;
+        struct rpc_task *task;
+        int            error = -EIO;    /* overwritten with -EBUSY below before any exit */
+        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
+                dentry->d_parent->d_name.name, dentry->d_name.name,
+                atomic_read(&dentry->d_count));
+        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+        /*
+         * We don't allow a dentry to be silly-renamed twice.
+         */
+        error = -EBUSY;
+        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+                goto out;
+        sprintf(silly, ".nfs%*.*Lx",
+                fileidsize, fileidsize,
+                (unsigned long long)NFS_FILEID(dentry->d_inode));
+        /* Return delegation in anticipation of the rename */
+        nfs_inode_return_delegation(dentry->d_inode);
+        sdentry = NULL;
+        do {
+                char *suffix = silly + slen - countersize;      /* counter digits occupy the tail of the name */
+                dput(sdentry);  /* drops the previous candidate; NULL on the first pass */
+                sillycounter++;
+                sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
+                dfprintk(VFS, "NFS: trying to rename %s to %s\n",
+                                dentry->d_name.name, silly);
+                sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+                /*
+                 * N.B. Better to return EBUSY here ... it could be
+                 * dangerous to delete the file while it's in use.
+                 */
+                if (IS_ERR(sdentry))
+                        goto out;       /* error is still -EBUSY; nothing to dput */
+        } while (sdentry->d_inode != NULL); /* need negative lookup */
+        /* queue unlink first. Can't do this from rpc_release as it
+         * has to allocate memory
+         */
+        error = nfs_async_unlink(dir, dentry);
+        if (error)
+                goto out_dput;
+        /* run the rename task, undo unlink if it fails */
+        task = nfs_async_rename(dir, dir, dentry, sdentry);     /* same directory for source and target */
+        if (IS_ERR(task)) {
+                error = -EBUSY;
+                nfs_cancel_async_unlink(dentry);
+                goto out_dput;
+        }
+        /* wait for the RPC task to complete, unless a SIGKILL intervenes */
+        error = rpc_wait_for_completion_task(task);
+        if (error == 0)
+                error = task->tk_status;
+        rpc_put_task(task);
+out_dput:
+        dput(sdentry);
+out:
+        return error;
+}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..4c14c17a5276 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        }
        return p;
 }
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
        nfs_pageio_cond_complete(pgio, page->index);
-        ret = nfs_page_async_flush(pgio, page,
+        ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
-                        wbc->sync_mode == WB_SYNC_NONE ||
-                        wbc->nonblocking != 0);
        if (ret == -EAGAIN) {
                redirty_page_for_writepage(wbc, page);
                ret = 0;
@@ -1433,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
        int flags = FLUSH_SYNC;
        int ret = 0;
-        /* Don't commit yet if this is a non-blocking flush and there are
+        if (wbc->sync_mode == WB_SYNC_NONE) {
-         * lots of outstanding writes for this mapping.
+                /* Don't commit yet if this is a non-blocking flush and there
-         */
+                 * are a lot of outstanding writes for this mapping.
-        if (wbc->sync_mode == WB_SYNC_NONE &&
+                 */
-            nfsi->ncommit <= (nfsi->npages >> 1))
+                if (nfsi->ncommit <= (nfsi->npages >> 1))
-                goto out_mark_dirty;
+                        goto out_mark_dirty;
-        if (wbc->nonblocking || wbc->for_background)
+                /* don't wait for the COMMIT response */
                flags = 0;
+        }
        ret = nfs_commit_inode(inode, flags);
        if (ret >= 0) {
                if (wbc->sync_mode == WB_SYNC_NONE) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 7cf4ddafb4ab..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -2,7 +2,6 @@ config NFSD
        tristate "NFS server support"
        depends on INET
        depends on FILE_LOCKING
-        depends on BKL # fix as soon as lockd is done
        select LOCKD
        select SUNRPC
        select EXPORTFS
@@ -29,6 +28,18 @@ config NFSD
          If unsure, say N.
+config NFSD_DEPRECATED
+        bool "Include support for deprecated syscall interface to NFSD"
+        depends on NFSD
+        default y
+        help
+          The syscall interface to nfsd was obsoleted in 2.6.0 by a new
+          filesystem based interface.  The old interface is due for removal
+          in 2.6.40.  If you wish to remove the interface before then
+          say N.
+          If unsure, say Y.
 config NFSD_V2_ACL
        bool
        depends on NFSD
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..c0fcb7ab7f6d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
 typedef struct auth_domain      svc_client;
 typedef struct svc_export       svc_export;
-static void             exp_do_unexport(svc_export *unexp);
-static int              exp_verify_string(char *cp, int max);
 /*
 * We have two caches.
 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
        return ek;
 }
+#ifdef CONFIG_NFSD_DEPRECATED
 static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
                       struct svc_export *exp)
 {
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
        return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
+#endif
 static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
                                     struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
        return exp;
 }
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
 * Hashtable locking. Write locks are placed only by user processes
 * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
 {
        up_write(&hash_sem);
 }
+#else
+/* hash_sem not needed once deprecated interface is removed */
+void exp_readlock(void) {}
+static inline void exp_writelock(void){}
+void exp_readunlock(void) {}
+static inline void exp_writeunlock(void){}
+#endif
+#ifdef CONFIG_NFSD_DEPRECATED
+static void             exp_do_unexport(svc_export *unexp);
+static int              exp_verify_string(char *cp, int max);
 static void exp_fsid_unhash(struct svc_export *exp)
 {
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
        ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
        if (!IS_ERR(ek)) {
-                ek->h.expiry_time = get_seconds()-1;
+                sunrpc_invalidate(&ek->h, &svc_expkey_cache);
                cache_put(&ek->h, &svc_expkey_cache);
        }
-        svc_expkey_cache.nextcheck = get_seconds();
 }
 static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
        ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
        if (!IS_ERR(ek)) {
-                ek->h.expiry_time = get_seconds()-1;
+                sunrpc_invalidate(&ek->h, &svc_expkey_cache);
                cache_put(&ek->h, &svc_expkey_cache);
        }
-        svc_expkey_cache.nextcheck = get_seconds();
 }
        
 /*
@@ -1097,8 +1108,7 @@ out:
 static void
 exp_do_unexport(svc_export *unexp)
 {
-        unexp->h.expiry_time = get_seconds()-1;
+        sunrpc_invalidate(&unexp->h, &svc_export_cache);
-        svc_export_cache.nextcheck = get_seconds();
        exp_unhash(unexp);
        exp_fsid_unhash(unexp);
 }
@@ -1150,6 +1160,7 @@ out_unlock:
        exp_writeunlock();
        return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /*
 * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
        show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
 }
+static bool secinfo_flags_equal(int f, int g)
+{
+        f &= NFSEXP_SECINFO_FLAGS;
+        g &= NFSEXP_SECINFO_FLAGS;
+        return f == g;
+}
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
+{
+        int flags;
+        flags = (*fp)->flags;
+        seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
+        (*fp)++;
+        while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
+                seq_printf(m, ":%d", (*fp)->pseudoflavor);
+                (*fp)++;
+        }
+        return flags;
+}
 static void show_secinfo(struct seq_file *m, struct svc_export *exp)
 {
        struct exp_flavor_info *f;
        struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-        int lastflags = 0, first = 0;
+        int flags;
        if (exp->ex_nflavors == 0)
                return;
-        for (f = exp->ex_flavors; f < end; f++) {
+        f = exp->ex_flavors;
-                if (first || f->flags != lastflags) {
+        flags = show_secinfo_run(m, &f, end);
-                        if (!first)
+        if (!secinfo_flags_equal(flags, exp->ex_flags))
-                                show_secinfo_flags(m, lastflags);
+                show_secinfo_flags(m, flags);
-                        seq_printf(m, ",sec=%d", f->pseudoflavor);
+        while (f != end) {
-                        lastflags = f->flags;
+                flags = show_secinfo_run(m, &f, end);
-                } else {
+                show_secinfo_flags(m, flags);
-                        seq_printf(m, ":%d", f->pseudoflavor);
-                }
        }
-        show_secinfo_flags(m, lastflags);
 }
 static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
        .show   = e_show,
 };
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
 * Add or modify a client.
 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
        /* Insert client into hashtable. */
        for (i = 0; i < ncp->cl_naddr; i++) {
                ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
-                auth_unix_add_addr(&addr6, dom);
+                auth_unix_add_addr(&init_net, &addr6, dom);
        }
        auth_unix_forget_old(dom);
        auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
        printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
        return 0;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /*
 * Initialize the exports module.
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..143da2eecd7b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 /* Index of predefined Linux callback client operations */
@@ -248,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
                   struct nfs4_cb_compound_hdr *hdr)
 {
        __be32 *p;
+        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
        if (hdr->minorversion == 0)
                return;
@@ -259,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
        RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
        WRITE32(OP_CB_SEQUENCE);
-        WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-        WRITE32(args->cbs_clp->cl_cb_seq_nr);
+        WRITE32(ses->se_cb_seq_nr);
        WRITE32(0);             /* slotid, always 0 */
        WRITE32(0);             /* highest slotid always 0 */
        WRITE32(0);             /* cachethis always 0 */
@@ -280,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 static int
 nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
-                struct nfs4_rpc_args *rpc_args)
+                struct nfsd4_callback *cb)
 {
        struct xdr_stream xdr;
-        struct nfs4_delegation *args = rpc_args->args_op;
+        struct nfs4_delegation *args = cb->cb_op;
        struct nfs4_cb_compound_hdr hdr = {
-                .ident = args->dl_ident,
+                .ident = cb->cb_clp->cl_cb_ident,
-                .minorversion = rpc_args->args_seq.cbs_minorversion,
+                .minorversion = cb->cb_minorversion,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
        encode_cb_compound_hdr(&xdr, &hdr);
-        encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
+        encode_cb_sequence(&xdr, cb, &hdr);
        encode_cb_recall(&xdr, args, &hdr);
        encode_cb_nops(&hdr);
        return 0;
@@ -339,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 * with a single slot.
 */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
                   struct rpc_rqst *rqstp)
 {
+        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
        struct nfs4_sessionid id;
        int status;
        u32 dummy;
        __be32 *p;
-        if (res->cbs_minorversion == 0)
+        if (cb->cb_minorversion == 0)
                return 0;
        status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -363,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
        READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
        memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-        if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+        if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
-                   NFS4_MAX_SESSIONID_LEN)) {
                dprintk("%s Invalid session id\n", __func__);
                goto out;
        }
        READ32(dummy);
-        if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+        if (dummy != ses->se_cb_seq_nr) {
                dprintk("%s Invalid sequence number\n", __func__);
                goto out;
        }
@@ -393,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-                struct nfsd4_cb_sequence *seq)
+                struct nfsd4_callback *cb)
 {
        struct xdr_stream xdr;
        struct nfs4_cb_compound_hdr hdr;
@@ -403,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
        status = decode_cb_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        if (seq) {
+        if (cb) {
-                status = decode_cb_sequence(&xdr, seq, rqstp);
+                status = decode_cb_sequence(&xdr, cb, rqstp);
                if (status)
                        goto out;
        }
@@ -473,30 +473,34 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
 * And why is cl_cb_set an atomic? */
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
        struct rpc_timeout      timeparms = {
                .to_initval     = max_cb_time(),
                .to_retries     = 0,
        };
        struct rpc_create_args args = {
-                .protocol       = XPRT_TRANSPORT_TCP,
+                .net            = &init_net,
-                .address        = (struct sockaddr *) &cb->cb_addr,
+                .address        = (struct sockaddr *) &conn->cb_addr,
-                .addrsize       = cb->cb_addrlen,
+                .addrsize       = conn->cb_addrlen,
                .timeout        = &timeparms,
                .program        = &cb_program,
-                .prognumber     = cb->cb_prog,
                .version        = 0,
                .authflavor     = clp->cl_flavor,
                .flags          = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
-                .client_name    = clp->cl_principal,
        };
        struct rpc_clnt *client;
-        if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+        if (clp->cl_minorversion == 0) {
-                return -EINVAL;
+                if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
-        if (cb->cb_minorversion) {
+                        return -EINVAL;
-                args.bc_xprt = cb->cb_xprt;
+                args.client_name = clp->cl_principal;
+                args.prognumber = conn->cb_prog,
+                args.protocol = XPRT_TRANSPORT_TCP;
+                clp->cl_cb_ident = conn->cb_ident;
+        } else {
+                args.bc_xprt = conn->cb_xprt;
+                args.prognumber = clp->cl_cb_session->se_cb_prog;
                args.protocol = XPRT_TRANSPORT_BC_TCP;
        }
        /* Create RPC client */
@@ -506,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
                        PTR_ERR(client));
                return PTR_ERR(client);
        }
-        nfsd4_set_callback_client(clp, client);
+        clp->cl_cb_client = client;
        return 0;
 }
@@ -519,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_client *clp = calldata;
+        struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
        if (task->tk_status)
                warn_no_callback_path(clp, task->tk_status);
@@ -528,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 }
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+        /* XXX: release method to ensure we set the cb channel down if
+         * necessary on early failure? */
        .rpc_call_done = nfsd4_cb_probe_done,
 };
@@ -543,38 +549,42 @@ int set_callback_cred(void)
        return 0;
 }
+static struct workqueue_struct *callback_wq;
-void do_probe_callback(struct nfs4_client *clp)
+static void do_probe_callback(struct nfs4_client *clp)
 {
-        struct rpc_message msg = {
+        struct nfsd4_callback *cb = &clp->cl_cb_null;
-                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-                .rpc_argp       = clp,
-                .rpc_cred       = callback_cred
-        };
-        int status;
-        status = rpc_call_async(clp->cl_cb_client, &msg,
+        cb->cb_op = NULL;
-                                RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+        cb->cb_clp = clp;
-                                &nfsd4_cb_probe_ops, (void *)clp);
-        if (status)
+        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
-                warn_no_callback_path(clp, status);
+        cb->cb_msg.rpc_argp = NULL;
+        cb->cb_msg.rpc_resp = NULL;
+        cb->cb_msg.rpc_cred = callback_cred;
+        cb->cb_ops = &nfsd4_cb_probe_ops;
+        queue_work(callback_wq, &cb->cb_work);
 }
 /*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
 */
-void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+void nfsd4_probe_callback(struct nfs4_client *clp)
 {
-        int status;
+        set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+        do_probe_callback(clp);
+}
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
        BUG_ON(atomic_read(&clp->cl_cb_set));
-        status = setup_callback_client(clp, cb);
+        spin_lock(&clp->cl_lock);
-        if (status) {
+        memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
-                warn_no_callback_path(clp, status);
+        spin_unlock(&clp->cl_lock);
-                return;
-        }
-        do_probe_callback(clp);
 }
 /*
@@ -585,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
                struct rpc_task *task)
 {
-        struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+        u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
-        u32 *ptr = (u32 *)clp->cl_sessionid.data;
        int status = 0;
        dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -598,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
                status = -EAGAIN;
                goto out;
        }
-        /*
-         * We'll need the clp during XDR encoding and decoding,
-         * and the sequence during decoding to verify the reply
-         */
-        args->args_seq.cbs_clp = clp;
-        task->tk_msg.rpc_resp = &args->args_seq;
 out:
        dprintk("%s status=%d\n", __func__, status);
        return status;
@@ -617,13 +618,13 @@ out:
 */
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
-        struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+        u32 minorversion = clp->cl_minorversion;
-        u32 minorversion = clp->cl_cb_conn.cb_minorversion;
        int status = 0;
-        args->args_seq.cbs_minorversion = minorversion;
+        cb->cb_minorversion = minorversion;
        if (minorversion) {
                status = nfsd41_cb_setup_sequence(clp, task);
                if (status) {
@@ -640,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
        dprintk("%s: minorversion=%d\n", __func__,
-                clp->cl_cb_conn.cb_minorversion);
+                clp->cl_minorversion);
-        if (clp->cl_cb_conn.cb_minorversion) {
+        if (clp->cl_minorversion) {
                /* No need for lock, access serialized in nfsd4_cb_prepare */
-                ++clp->cl_cb_seq_nr;
+                ++clp->cl_cb_session->se_cb_seq_nr;
                clear_bit(0, &clp->cl_cb_slot_busy);
                rpc_wake_up_next(&clp->cl_cb_waitq);
                dprintk("%s: freed slot, new seqid=%d\n", __func__,
-                        clp->cl_cb_seq_nr);
+                        clp->cl_cb_session->se_cb_seq_nr);
                /* We're done looking into the sequence information */
                task->tk_msg.rpc_resp = NULL;
@@ -662,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
        struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
@@ -707,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_recall_release(void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        nfs4_put_delegation(dp);
 }
@@ -718,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
        .rpc_release = nfsd4_cb_recall_release,
 };
-static struct workqueue_struct *callback_wq;
 int nfsd4_create_callback_queue(void)
 {
        callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +736,88 @@ void nfsd4_destroy_callback_queue(void)
 }
 /* must be called under the state lock */
-void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
 {
-        struct rpc_clnt *old = clp->cl_cb_client;
+        set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
-        clp->cl_cb_client = new;
        /*
-         * After this, any work that saw the old value of cl_cb_client will
+         * Note this won't actually result in a null callback;
-         * be gone:
+         * instead, nfsd4_do_callback_rpc() will detect the killed
+         * client, destroy the rpc client, and stop:
         */
+        do_probe_callback(clp);
        flush_workqueue(callback_wq);
-        /* So we can safely shut it down: */
-        if (old)
-                rpc_shutdown_client(old);
 }
-/*
+void nfsd4_release_cb(struct nfsd4_callback *cb)
- * called with dp->dl_count inc'ed.
- */
-static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
-        struct nfs4_client *clp = dp->dl_client;
+        if (cb->cb_ops->rpc_release)
-        struct rpc_clnt *clnt = clp->cl_cb_client;
+                cb->cb_ops->rpc_release(cb);
-        struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
+}
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-                .rpc_cred = callback_cred
-        };
-        if (clnt == NULL) {
+void nfsd4_process_cb_update(struct nfsd4_callback *cb)
-                nfs4_put_delegation(dp);
+{
-                return; /* Client is shutting down; give up. */
+        struct nfs4_cb_conn conn;
+        struct nfs4_client *clp = cb->cb_clp;
+        int err;
+        /*
+         * This is either an update, or the client dying; in either case,
+         * kill the old client:
+         */
+        if (clp->cl_cb_client) {
+                rpc_shutdown_client(clp->cl_cb_client);
+                clp->cl_cb_client = NULL;
        }
+        if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
+                return;
+        spin_lock(&clp->cl_lock);
+        /*
+         * Only serialized callback code is allowed to clear these
+         * flags; main nfsd code can only set them:
+         */
+        BUG_ON(!clp->cl_cb_flags);
+        clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+        memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+        spin_unlock(&clp->cl_lock);
-        args->args_op = dp;
+        err = setup_callback_client(clp, &conn);
-        msg.rpc_argp = args;
+        if (err)
-        dp->dl_retries = 1;
+                warn_no_callback_path(clp, err);
-        rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
 }
 void nfsd4_do_callback_rpc(struct work_struct *w)
 {
-        /* XXX: for now, just send off delegation recall. */
+        struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
-        /* In future, generalize to handle any sort of callback. */
+        struct nfs4_client *clp = cb->cb_clp;
-        struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+        struct rpc_clnt *clnt;
-        struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
-        _nfsd4_cb_recall(dp);
+        if (clp->cl_cb_flags)
-}
+                nfsd4_process_cb_update(cb);
+        clnt = clp->cl_cb_client;
+        if (!clnt) {
+                /* Callback channel broken, or client killed; give up: */
+                nfsd4_release_cb(cb);
+                return;
+        }
+        rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+                        cb->cb_ops, cb);
+}
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
+        struct nfsd4_callback *cb = &dp->dl_recall;
+        dp->dl_retries = 1;
+        cb->cb_op = dp;
+        cb->cb_clp = dp->dl_client;
+        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+        cb->cb_msg.rpc_argp = cb;
+        cb->cb_msg.rpc_resp = cb;
+        cb->cb_msg.rpc_cred = callback_cred;
+        cb->cb_ops = &nfsd4_cb_recall_ops;
+        dp->dl_retries = 1;
        queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..f0695e815f0e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
        cache_unregister(&nametoid_cache);
 }
-/*
- * Deferred request handling
- */
-struct idmap_defer_req {
-       struct cache_req         req;
-       struct cache_deferred_req deferred_req;
-       wait_queue_head_t        waitq;
-       atomic_t                 count;
-};
-static inline void
-put_mdr(struct idmap_defer_req *mdr)
-{
-        if (atomic_dec_and_test(&mdr->count))
-                kfree(mdr);
-}
-static inline void
-get_mdr(struct idmap_defer_req *mdr)
-{
-        atomic_inc(&mdr->count);
-}
-static void
-idmap_revisit(struct cache_deferred_req *dreq, int toomany)
-{
-        struct idmap_defer_req *mdr =
-                container_of(dreq, struct idmap_defer_req, deferred_req);
-        wake_up(&mdr->waitq);
-        put_mdr(mdr);
-}
-static struct cache_deferred_req *
-idmap_defer(struct cache_req *req)
-{
-        struct idmap_defer_req *mdr =
-                container_of(req, struct idmap_defer_req, req);
-        mdr->deferred_req.revisit = idmap_revisit;
-        get_mdr(mdr);
-        return (&mdr->deferred_req);
-}
-static inline int
-do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
-                struct cache_detail *detail, struct ent **item,
-                struct idmap_defer_req *mdr)
-{
-        *item = lookup_fn(key);
-        if (!*item)
-                return -ENOMEM;
-        return cache_check(detail, &(*item)->h, &mdr->req);
-}
-static inline int
-do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
-                        struct ent *key, struct cache_detail *detail,
-                        struct ent **item)
-{
-        int ret = -ENOMEM;
-        *item = lookup_fn(key);
-        if (!*item)
-                goto out_err;
-        ret = -ETIMEDOUT;
-        if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-                        || (*item)->h.expiry_time < get_seconds()
-                        || detail->flush_time > (*item)->h.last_refresh)
-                goto out_put;
-        ret = -ENOENT;
-        if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
-                goto out_put;
-        return 0;
-out_put:
-        cache_put(&(*item)->h, detail);
-out_err:
-        *item = NULL;
-        return ret;
-}
 static int
 idmap_lookup(struct svc_rqst *rqstp,
                struct ent *(*lookup_fn)(struct ent *), struct ent *key,
                struct cache_detail *detail, struct ent **item)
 {
-        struct idmap_defer_req *mdr;
        int ret;
-        mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
+        *item = lookup_fn(key);
-        if (!mdr)
+        if (!*item)
                return -ENOMEM;
-        atomic_set(&mdr->count, 1);
+ retry:
-        init_waitqueue_head(&mdr->waitq);
+        ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
-        mdr->req.defer = idmap_defer;
-        ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr);
+        if (ret == -ETIMEDOUT) {
-        if (ret == -EAGAIN) {
+                struct ent *prev_item = *item;
-                wait_event_interruptible_timeout(mdr->waitq,
+                *item = lookup_fn(key);
-                        test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ);
+                if (*item != prev_item)
-                ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item);
+                        goto retry;
+                cache_put(&(*item)->h, detail);
        }
-        put_mdr(mdr);
        return ret;
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..0cdfd022bb7b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
        resp->cstate.session = NULL;
        fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
        fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
-        /* Use the deferral mechanism only for NFSv4.0 compounds */
+        /*
-        rqstp->rq_usedeferral = (args->minorversion == 0);
+         * Don't use the deferral mechanism for NFSv4; compounds make it
+         * too hard to avoid non-idempotency problems.
+         */
+        rqstp->rq_usedeferral = 0;
        /*
         * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a7292fcf7718..56347e0ac88d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
        struct nfs4_delegation *dp;
        struct nfs4_file *fp = stp->st_file;
-        struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
        dprintk("NFSD alloc_init_deleg\n");
        /*
@@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        nfs4_file_get_access(fp, O_RDONLY);
        dp->dl_flock = NULL;
        dp->dl_type = type;
-        dp->dl_ident = cb->cb_ident;
        dp->dl_stateid.si_boot = boot_time;
        dp->dl_stateid.si_stateownerid = current_delegid++;
        dp->dl_stateid.si_fileid = 0;
@@ -535,171 +533,258 @@ gen_sessionid(struct nfsd4_session *ses)
 */
 #define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
+/*
+ * Free the per-slot buffers of @ses's fore channel: one kfree() for each
+ * of the se_fchannel.maxreqs entries of se_slots[].  The session
+ * structure itself is left for the caller to free.
+ */
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+        int idx = ses->se_fchannel.maxreqs;
+        while (idx-- > 0)
+                kfree(ses->se_slots[idx]);
+}
 /*
- * Give the client the number of ca_maxresponsesize_cached slots it
+ * We don't actually need to cache the rpc and session headers, so we
- * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
+ * can allocate a little less for each slot:
- * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
+ */
- * than NFSD_MAX_SLOTS_PER_SESSION.
+static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
- *
+{
- * If we run out of reserved DRC memory we should (up to a point)
+        return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; /* per-slot cache bytes: advertised cached size minus the header allowance that is never cached */
+}
+/*
+ * Turn the client's requested ca_maxresponsesize_cached (@size) into the
+ * number of reply bytes cached per slot: drop the rpc/session header
+ * allowance, which is never cached, and cap at NFSD_SLOT_CACHE_SIZE.
+ * A request smaller than the header allowance yields 0 instead of
+ * wrapping around to a huge unsigned value.
+ */
+static int nfsd4_sanitize_slot_size(u32 size)
+{
+        if (size < NFSD_MIN_HDR_SEQ_SZ)
+                return 0;
+        size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
+        size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
+        return size;
+}
+/*
+ * Reserve DRC memory for up to @num slots of @slotsize bytes each and
+ * return how many slots were actually granted.  The count is bounded by
+ * NFSD_MAX_SLOTS_PER_SESSION, NFSD_MAX_MEM_PER_SESSION and by what is
+ * left of nfsd_drc_max_mem; the granted bytes are added to
+ * nfsd_drc_mem_used under nfsd_drc_lock.  Slots of zero bytes need no
+ * reservation and are granted without touching the pool.
+ *
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
 * re-negotiate active sessions and reduce their slot usage to make
 * rooom for new connections. For now we just fail the create session.
 */
-static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
+static int nfsd4_get_drc_mem(int slotsize, u32 num)
 {
-        int mem, size = fchan->maxresp_cached;
+        int avail;
-        if (fchan->maxreqs < 1)
+        num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
-                return nfserr_inval;
-        if (size < NFSD_MIN_HDR_SEQ_SZ)
+        /* Zero-byte slots consume no DRC memory; also avoids dividing by 0. */
+        if (slotsize <= 0)
+                return num;
+        spin_lock(&nfsd_drc_lock);
-                size = NFSD_MIN_HDR_SEQ_SZ;
+        avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
-        size -= NFSD_MIN_HDR_SEQ_SZ;
+                        nfsd_drc_max_mem - nfsd_drc_mem_used);
-        if (size > NFSD_SLOT_CACHE_SIZE)
+        num = min_t(int, num, avail / slotsize);
-                size = NFSD_SLOT_CACHE_SIZE;
+        nfsd_drc_mem_used += num * slotsize;
+        spin_unlock(&nfsd_drc_lock);
-        /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
-        mem = fchan->maxreqs * size;
-        if (mem > NFSD_MAX_MEM_PER_SESSION) {
-                fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
-                if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
-                        fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-                mem = fchan->maxreqs * size;
-        }
+        return num;
+}
+/*
+ * Give @num slots' worth of DRC memory (@slotsize bytes each) back to the
+ * pool by subtracting it from nfsd_drc_mem_used under nfsd_drc_lock.
+ */
+static void nfsd4_put_drc_mem(int slotsize, int num)
+{
+        int bytes = num * slotsize;
        spin_lock(&nfsd_drc_lock);
-        /* bound the total session drc memory ussage */
+        nfsd_drc_mem_used -= bytes;
-        if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
-                fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
-                mem = fchan->maxreqs * size;
-        }
-        nfsd_drc_mem_used += mem;
        spin_unlock(&nfsd_drc_lock);
+}
-        if (fchan->maxreqs == 0)
+/*
+ * Allocate a zeroed session with room for @numslots slot pointers, then
+ * one zeroed buffer per slot holding a struct nfsd4_slot followed by
+ * @slotsize bytes of reply cache.  Returns NULL, with everything already
+ * allocated freed again, if any allocation fails.
+ */
+static struct nfsd4_session *alloc_session(int slotsize, int numslots)
-                return nfserr_jukebox;
+{
+        struct nfsd4_session *ses;
+        int table_bytes, per_slot, n;
-        fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
+        BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
-        return 0;
+                        + sizeof(struct nfsd4_session) > PAGE_SIZE);
+        /* the session and its slot pointer table share one allocation */
+        table_bytes = numslots * sizeof(struct nfsd4_slot *);
+        ses = kzalloc(sizeof(*ses) + table_bytes, GFP_KERNEL);
+        if (!ses)
+                return NULL;
+        /* allocate each struct nfsd4_slot and data cache in one piece */
+        per_slot = sizeof(struct nfsd4_slot) + slotsize;
+        for (n = 0; n < numslots; n++) {
+                ses->se_slots[n] = kzalloc(per_slot, GFP_KERNEL);
+                if (ses->se_slots[n])
+                        continue;
+                /* failure: release the slots obtained so far */
+                while (n--)
+                        kfree(ses->se_slots[n]);
+                kfree(ses);
+                return NULL;
+        }
+        return ses;
 }
-/*
+/*
+ * Fill in the fore channel attributes @new for a session: request size,
+ * response size and op count are the client's values from @req capped by
+ * the server's limits; the slot count and cached response size reflect
+ * @numslots and @slotsize as passed in by the caller.
+ */
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
- * fchan holds the client values on input, and the server values on output
- * sv_max_mesg is the maximum payload plus one page for overhead.
- */
-static int init_forechannel_attrs(struct svc_rqst *rqstp,
-                                  struct nfsd4_channel_attrs *session_fchan,
-                                  struct nfsd4_channel_attrs *fchan)
 {
-        int status = 0;
+        const u32 msg_limit = nfsd_serv->sv_max_mesg;
-        __u32   maxcount = nfsd_serv->sv_max_mesg;
-        /* headerpadsz set to zero in encode routine */
+        /* client-requested values, capped by what the server supports */
+        new->maxreq_sz = min_t(u32, req->maxreq_sz, msg_limit);
+        new->maxresp_sz = min_t(u32, req->maxresp_sz, msg_limit);
+        new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+        /* the header allowance is added back to the advertised cached size */
+        new->maxreqs = numslots;
+        new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+}
-        /* Use the client's max request and max response size if possible */
+static void free_conn(struct nfsd4_conn *c) /* release a connection record; c must not be NULL */
-        if (fchan->maxreq_sz > maxcount)
+{
-                fchan->maxreq_sz = maxcount;
+        svc_xprt_put(c->cn_xprt); /* drops the transport reference taken by svc_xprt_get() in alloc_conn() */
-        session_fchan->maxreq_sz = fchan->maxreq_sz;
+        kfree(c); /* must come last: c->cn_xprt was still needed above */
+}
-        if (fchan->maxresp_sz > maxcount)
+/*
+ * Transport-user callback installed by nfsd4_register_conn().  If the
+ * connection is still linked on its session's list, unlink and free it
+ * under the owning client's cl_lock; a node that has already been
+ * unlinked (see the list_del_init() in nfsd4_del_conns()) is left alone.
+ */
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
-                fchan->maxresp_sz = maxcount;
+{
-        session_fchan->maxresp_sz = fchan->maxresp_sz;
+        struct nfsd4_conn *conn = container_of(u, struct nfsd4_conn, cn_xpt_user);
+        struct nfs4_client *owner = conn->cn_session->se_client;
-        /* Use the client's maxops if possible */
+        spin_lock(&owner->cl_lock);
-        if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+        if (list_empty(&conn->cn_persession))
+                goto unlock;
-                fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+        list_del(&conn->cn_persession);
-        session_fchan->maxops = fchan->maxops;
+        free_conn(conn);
+unlock:
+        spin_unlock(&owner->cl_lock);
+}
-        /* FIXME: Error means no more DRC pages so the server should
+/*
+ * Allocate a connection record for the transport @rqstp arrived on,
+ * taking a reference on that transport and recording @flags.  The record
+ * is not attached to any session yet.  Returns NULL if the allocation
+ * fails (no transport reference is taken in that case).
+ */
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
-         * recover pages from existing sessions. For now fail session
+{
-         * creation.
+        struct nfsd4_conn *c = kmalloc(sizeof(*c), GFP_KERNEL);
-         */
-        status = set_forechannel_drc_size(fchan);
-        session_fchan->maxresp_cached = fchan->maxresp_cached;
-        session_fchan->maxreqs = fchan->maxreqs;
+        if (c) {
+                svc_xprt_get(rqstp->rq_xprt);
+                c->cn_xprt = rqstp->rq_xprt;
+                c->cn_flags = flags;
+                INIT_LIST_HEAD(&c->cn_xpt_user.list);
+        }
+        return c;
+}
-        dprintk("%s status %d\n", __func__, status);
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) /* link conn into ses; takes no lock itself */
-        return status;
+{
+        conn->cn_session = ses; /* back-pointer that nfsd4_conn_lost() follows to reach the client */
+        list_add(&conn->cn_persession, &ses->se_conns); /* callers in this file hold the client's cl_lock around this */
 }
-static void
+/* Link @conn into @ses while holding the owning client's cl_lock. */
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
-free_session_slots(struct nfsd4_session *ses)
 {
-        int i;
-        for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+        spin_lock(&ses->se_client->cl_lock);
-                kfree(ses->se_slots[i]);
+        __nfsd4_hash_conn(conn, ses);
+        spin_unlock(&ses->se_client->cl_lock);
 }
-/*
+/*
+ * Install nfsd4_conn_lost() as @conn's transport-user callback and
+ * register that user with the connection's transport.
+ */
+static void nfsd4_register_conn(struct nfsd4_conn *conn)
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
 {
-        return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+        struct svc_xpt_user *user = &conn->cn_xpt_user;
+        user->callback = nfsd4_conn_lost;
+        register_xpt_user(conn->cn_xprt, user);
 }
-static int
+/*
+ * Create a connection record for the transport carrying @rqstp, link it
+ * into @ses and register it with the transport.  Returns nfs_ok, or
+ * nfserr_jukebox if the record cannot be allocated.
+ */
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
-alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
-                   struct nfsd4_create_session *cses)
 {
-        struct nfsd4_session *new, tmp;
+        struct nfsd4_conn *c;
-        struct nfsd4_slot *sp;
+        u32 dir;
-        int idx, slotsize, cachesize, i;
-        int status;
-        memset(&tmp, 0, sizeof(tmp));
+        /* always fore channel; back channel too if the session asked for it */
+        dir = (ses->se_flags & SESSION4_BACK_CHAN)
+                ? (NFS4_CDFC4_FORE | NFS4_CDFC4_BACK) : NFS4_CDFC4_FORE;
+        c = alloc_conn(rqstp, dir);
+        if (!c)
+                return nfserr_jukebox;
+        nfsd4_hash_conn(c, ses);
+        nfsd4_register_conn(c);
+        return nfs_ok;
+}
-        /* FIXME: For now, we just accept the client back channel attributes. */
+static void nfsd4_del_conns(struct nfsd4_session *s) /* unlink, unregister and free every connection on s->se_conns */
-        tmp.se_bchannel = cses->back_channel;
+{
-        status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
+        struct nfs4_client *clp = s->se_client;
-                                        &cses->fore_channel);
+        struct nfsd4_conn *c;
-        if (status)
-                goto out;
-        BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+        spin_lock(&clp->cl_lock);
-                     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+        while (!list_empty(&s->se_conns)) {
+                c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+                list_del_init(&c->cn_persession); /* _init variant: nfsd4_conn_lost() tests list_empty() on this node */
+                spin_unlock(&clp->cl_lock); /* cl_lock is not held across unregister + free */
-        status = nfserr_jukebox;
+                unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
-        /* allocate struct nfsd4_session and slot table pointers in one piece */
+                free_conn(c);
-        slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
-        new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
-        if (!new)
-                goto out;
-        memcpy(new, &tmp, sizeof(*new));
+                spin_lock(&clp->cl_lock); /* retake before re-testing the list */
+        }
+        spin_unlock(&clp->cl_lock);
+}
-        /* allocate each struct nfsd4_slot and data cache in one piece */
+/*
+ * kref release function for a session: tear down its connections, give
+ * its DRC memory (maxreqs slots of slot_bytes() each) back to
+ * nfsd_drc_mem_used, then free the slots and the session itself.
+ */
+void free_session(struct kref *kref)
-        cachesize = slot_bytes(&new->se_fchannel);
+{
-        for (i = 0; i < new->se_fchannel.maxreqs; i++) {
+        struct nfsd4_session *ses = container_of(kref, struct nfsd4_session, se_ref);
-                sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
+        int drc_bytes;
-                if (!sp)
-                        goto out_free;
+        nfsd4_del_conns(ses);
-                new->se_slots[i] = sp;
+        drc_bytes = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+        spin_lock(&nfsd_drc_lock);
+        nfsd_drc_mem_used -= drc_bytes;
+        spin_unlock(&nfsd_drc_lock);
+        free_session_slots(ses);
+        kfree(ses);
+}
+/*
+ * Create, hash and return a new session for @clp from the CREATE_SESSION
+ * arguments in @cses, and bind it to the connection @rqstp arrived on.
+ * If the session requests a back channel and the client has no callback
+ * session yet, this session also becomes the client's callback session.
+ *
+ * Returns NULL if no slots could be reserved, if an allocation fails, or
+ * if the connection could not be set up.  On every failure path the DRC
+ * reservation is given back and the session is not left on any list.
+ */
+static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+        struct nfsd4_session *new;
+        struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
+        int numslots, slotsize;
+        int status;
+        int idx;
+        /*
+         * Note decreasing slot size below client's request may
+         * make it difficult for client to function correctly, whereas
+         * decreasing the number of slots will (just?) affect
+         * performance.  When short on memory we therefore prefer to
+         * decrease number of slots instead of their size.
+         */
+        slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
+        numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+        /* a session without slots is unusable; fail the create session */
+        if (numslots < 1)
+                return NULL;
+        new = alloc_session(slotsize, numslots);
+        if (!new) {
+                /* give back what was granted, not what the client asked for */
+                nfsd4_put_drc_mem(slotsize, numslots);
+                return NULL;
        }
+        init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
        new->se_client = clp;
        gen_sessionid(new);
-        idx = hash_sessionid(&new->se_sessionid);
-        memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
-               NFS4_MAX_SESSIONID_LEN);
+        INIT_LIST_HEAD(&new->se_conns);
+        new->se_cb_seq_nr = 1;
        new->se_flags = cses->flags;
+        new->se_cb_prog = cses->callback_prog;
        kref_init(&new->se_ref);
+        idx = hash_sessionid(&new->se_sessionid);
        spin_lock(&client_lock);
        list_add(&new->se_hash, &sessionid_hashtbl[idx]);
        list_add(&new->se_perclnt, &clp->cl_sessions);
        spin_unlock(&client_lock);
-        status = nfs_ok;
+        status = nfsd4_new_conn(rqstp, new);
-out:
+        /* whoops: benny points out, status is ignored! (err, or bogus) */
-        return status;
+        if (status) {
-out_free:
+                /* unlink first so no list still points at the freed session */
+                spin_lock(&client_lock);
+                list_del(&new->se_hash);
+                list_del(&new->se_perclnt);
+                spin_unlock(&client_lock);
+                free_session(&new->se_ref);
-        free_session_slots(new);
+                return NULL;
-        kfree(new);
+        }
-        goto out;
+        if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+                struct sockaddr *sa = svc_addr(rqstp);
+                clp->cl_cb_session = new;
+                clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+                svc_xprt_get(rqstp->rq_xprt);
+                rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+                clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+                nfsd4_probe_callback(clp);
+        }
+        return new;
 }
 /* caller must hold client_lock */
@@ -731,21 +816,6 @@ unhash_session(struct nfsd4_session *ses)
        list_del(&ses->se_perclnt);
 }
-void
-free_session(struct kref *kref)
-{
-        struct nfsd4_session *ses;
-        int mem;
-        ses = container_of(kref, struct nfsd4_session, se_ref);
-        spin_lock(&nfsd_drc_lock);
-        mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
-        nfsd_drc_mem_used -= mem;
-        spin_unlock(&nfsd_drc_lock);
-        free_session_slots(ses);
-        kfree(ses);
-}
 /* must be called under the client_lock */
 static inline void
 renew_client_locked(struct nfs4_client *clp)
@@ -812,6 +882,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
+        while (!list_empty(&clp->cl_sessions)) {
+                struct nfsd4_session *ses;
+                ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+                                se_perclnt);
+                list_del(&ses->se_perclnt);
+                nfsd4_put_session(ses);
+        }
        if (clp->cl_cred.cr_group_info)
                put_group_info(clp->cl_cred.cr_group_info);
        kfree(clp->cl_principal);
@@ -838,15 +915,12 @@ release_session_client(struct nfsd4_session *session)
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
+        struct nfsd4_session *ses;
        mark_client_expired(clp);
        list_del(&clp->cl_lru);
-        while (!list_empty(&clp->cl_sessions)) {
+        list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) /* sessions stay linked on cl_sessions; no reference is dropped here */
-                struct nfsd4_session  *ses;
+                list_del_init(&ses->se_hash); /* only the se_hash linkage is removed */
-                ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-                                 se_perclnt);
-                unhash_session(ses);
-                nfsd4_put_session(ses);
-        }
 }
 static void
@@ -875,7 +949,7 @@ expire_client(struct nfs4_client *clp)
                sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
                release_openowner(sop);
        }
-        nfsd4_set_callback_client(clp, NULL);
+        nfsd4_shutdown_callback(clp);
        if (clp->cl_cb_conn.cb_xprt)
                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
        list_del(&clp->cl_idhash);
@@ -960,6 +1034,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        if (clp == NULL)
                return NULL;
+        INIT_LIST_HEAD(&clp->cl_sessions);
        princ = svc_gss_principal(rqstp);
        if (princ) {
                clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -976,8 +1052,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        INIT_LIST_HEAD(&clp->cl_strhash);
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
-        INIT_LIST_HEAD(&clp->cl_sessions);
        INIT_LIST_HEAD(&clp->cl_lru);
+        spin_lock_init(&clp->cl_lock);
+        INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
        clp->cl_time = get_seconds();
        clear_bit(0, &clp->cl_cb_slot_busy);
        rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1063,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        clp->cl_flavor = rqstp->rq_flavor;
        copy_cred(&clp->cl_cred, &rqstp->rq_cred);
        gen_confirm(clp);
+        clp->cl_cb_session = NULL;
        return clp;
 }
@@ -1098,7 +1175,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
-        struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+        struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
        unsigned short expected_family;
        /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1188,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
        else
                goto out_err;
-        cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+        conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
                                            se->se_callback_addr_len,
-                                            (struct sockaddr *) &cb->cb_addr,
+                                            (struct sockaddr *)&conn->cb_addr,
-                                            sizeof(cb->cb_addr));
+                                            sizeof(conn->cb_addr));
-        if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
+        if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
                goto out_err;
-        if (cb->cb_addr.ss_family == AF_INET6)
+        if (conn->cb_addr.ss_family == AF_INET6)
-                ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+                ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
-        cb->cb_minorversion = 0;
+        conn->cb_prog = se->se_callback_prog;
-        cb->cb_prog = se->se_callback_prog;
+        conn->cb_ident = se->se_callback_ident;
-        cb->cb_ident = se->se_callback_ident;
        return;
 out_err:
-        cb->cb_addr.ss_family = AF_UNSPEC;
+        conn->cb_addr.ss_family = AF_UNSPEC;
-        cb->cb_addrlen = 0;
+        conn->cb_addrlen = 0;
        dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
                "will not receive delegations\n",
                clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1415,7 +1491,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
        struct sockaddr *sa = svc_addr(rqstp);
        struct nfs4_client *conf, *unconf;
+        struct nfsd4_session *new;
        struct nfsd4_clid_slot *cs_slot = NULL;
+        bool confirm_me = false;
        int status = 0;
        nfs4_lock_state();
@@ -1438,7 +1516,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                                cs_slot->sl_seqid, cr_ses->seqid);
                        goto out;
                }
-                cs_slot->sl_seqid++;
        } else if (unconf) {
                if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
                    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1528,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                if (status) {
                        /* an unconfirmed replay returns misordered */
                        status = nfserr_seq_misordered;
-                        goto out_cache;
+                        goto out;
                }
-                cs_slot->sl_seqid++; /* from 0 to 1 */
+                confirm_me = true;
-                move_to_confirmed(unconf);
-                if (cr_ses->flags & SESSION4_BACK_CHAN) {
-                        unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-                        svc_xprt_get(rqstp->rq_xprt);
-                        rpc_copy_addr(
-                                (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-                                sa);
-                        unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-                        unconf->cl_cb_conn.cb_minorversion =
-                                cstate->minorversion;
-                        unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-                        unconf->cl_cb_seq_nr = 1;
-                        nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-                }
                conf = unconf;
        } else {
                status = nfserr_stale_clientid;
@@ -1477,22 +1539,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        }
        /*
+         * XXX: we should probably set this at creation time, and check
+         * for consistent minorversion use throughout:
+         */
+        conf->cl_minorversion = 1;
+        /*
         * We do not support RDMA or persistent sessions
         */
        cr_ses->flags &= ~SESSION4_PERSIST;
        cr_ses->flags &= ~SESSION4_RDMA;
-        status = alloc_init_session(rqstp, conf, cr_ses);
+        status = nfserr_jukebox;
-        if (status)
+        new = alloc_init_session(rqstp, conf, cr_ses);
+        if (!new)
                goto out;
+        status = nfs_ok;
-        memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+        memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
               NFS4_MAX_SESSIONID_LEN);
+        cs_slot->sl_seqid++;
        cr_ses->seqid = cs_slot->sl_seqid;
-out_cache:
        /* cache solo and embedded create sessions under the state lock */
        nfsd4_cache_create_session(cr_ses, cs_slot, status);
+        if (confirm_me)
+                move_to_confirmed(conf);
 out:
        nfs4_unlock_state();
        dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1546,8 +1616,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
        nfs4_lock_state();
        /* wait for callbacks */
-        nfsd4_set_callback_client(ses->se_client, NULL);
+        nfsd4_shutdown_callback(ses->se_client);
        nfs4_unlock_state();
+        nfsd4_del_conns(ses);
        nfsd4_put_session(ses);
        status = nfs_ok;
 out:
@@ -1555,6 +1628,36 @@ out:
        return status;
 }
+/*
+ * Return the connection of session @s that uses transport @xpt, or NULL
+ * if the session has none.  Walks s->se_conns without taking any lock.
+ */
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+        struct nfsd4_conn *found = NULL;
+        struct nfsd4_conn *cur;
+        list_for_each_entry(cur, &s->se_conns, cn_persession) {
+                if (cur->cn_xprt != xpt)
+                        continue;
+                found = cur;
+                break;
+        }
+        return found;
+}
+/*
+ * Consume the preallocated connection record @new: if @ses already has a
+ * connection on the same transport, @new is freed; otherwise it is linked
+ * into @ses under the client's cl_lock and then registered with the
+ * transport after the lock is dropped.
+ */
+static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+        struct nfs4_client *clp = ses->se_client;
+        bool already_bound;
+        spin_lock(&clp->cl_lock);
+        already_bound = __nfsd4_find_conn(new->cn_xprt, ses) != NULL;
+        if (!already_bound)
+                __nfsd4_hash_conn(new, ses);
+        spin_unlock(&clp->cl_lock);
+        if (already_bound)
+                free_conn(new);
+        else
+                nfsd4_register_conn(new);
+}
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
               struct nfsd4_compound_state *cstate,
@@ -1563,11 +1666,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        struct nfsd4_compoundres *resp = rqstp->rq_resp;
        struct nfsd4_session *session;
        struct nfsd4_slot *slot;
+        struct nfsd4_conn *conn;
        int status;
        if (resp->opcnt != 1)
                return nfserr_sequence_pos;
+        /*
+         * Will be either used or freed by nfsd4_sequence_check_conn
+         * below.
+         */
+        conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+        if (!conn)
+                return nfserr_jukebox;
        spin_lock(&client_lock);
        status = nfserr_badsession;
        session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1711,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        if (status)
                goto out;
+        nfsd4_sequence_check_conn(conn, session);
+        conn = NULL;
        /* Success! bump slot seqid */
        slot->sl_inuse = true;
        slot->sl_seqid = seq->seqid;
@@ -1613,6 +1728,7 @@ out:
                nfsd4_get_session(cstate->session);
                atomic_inc(&session->se_client->cl_refcount);
        }
+        kfree(conn);
        spin_unlock(&client_lock);
        dprintk("%s: return %d\n", __func__, ntohl(status));
        return status;
@@ -1747,6 +1863,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                gen_clid(new);
        }
+        /*
+         * XXX: we should probably set this at creation time, and check
+         * for consistent minorversion use throughout:
+         */
+        new->cl_minorversion = 0;
        gen_callback(new, setclid, rpc_get_scope_id(sa));
        add_to_unconfirmed(new, strhashval);
        setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1807,7 +1928,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfserr_clid_inuse;
                else {
                        atomic_set(&conf->cl_cb_set, 0);
-                        nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
+                        nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+                        nfsd4_probe_callback(conf);
                        expire_client(unconf);
                        status = nfs_ok;
@@ -1841,7 +1963,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        }
                        move_to_confirmed(unconf);
                        conf = unconf;
-                        nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+                        nfsd4_probe_callback(conf);
                        status = nfs_ok;
                }
        } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2492,7 +2614,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        struct nfs4_delegation *dp;
        struct nfs4_stateowner *sop = stp->st_stateowner;
        int cb_up = atomic_read(&sop->so_client->cl_cb_set);
-        struct file_lock fl, *flp = &fl;
+        struct file_lock *fl;
        int status, flag = 0;
        flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2526,20 +2648,24 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
                flag = NFS4_OPEN_DELEGATE_NONE;
                goto out;
        }
-        locks_init_lock(&fl);
+        status = -ENOMEM;
-        fl.fl_lmops = &nfsd_lease_mng_ops;
+        fl = locks_alloc_lock();
-        fl.fl_flags = FL_LEASE;
+        if (!fl)
-        fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+                goto out;
-        fl.fl_end = OFFSET_MAX;
+        locks_init_lock(fl);
-        fl.fl_owner =  (fl_owner_t)dp;
+        fl->fl_lmops = &nfsd_lease_mng_ops;
-        fl.fl_file = find_readable_file(stp->st_file);
+        fl->fl_flags = FL_LEASE;
-        BUG_ON(!fl.fl_file);
+        fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-        fl.fl_pid = current->tgid;
+        fl->fl_end = OFFSET_MAX;
+        fl->fl_owner =  (fl_owner_t)dp;
+        fl->fl_file = find_readable_file(stp->st_file);
+        BUG_ON(!fl->fl_file);
+        fl->fl_pid = current->tgid;
        /* vfs_setlease checks to see if delegation should be handed out.
         * the lock_manager callbacks fl_mylease and fl_change are used
         */
-        if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
+        if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
                dprintk("NFSD: setlease failed [%d], no delegation\n", status);
                unhash_delegation(dp);
                flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2944,7 +3070,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
        if (STALE_STATEID(stateid)) 
                goto out;
-        status = nfserr_bad_stateid;
+        /*
+         * We assume that any stateid that has the current boot time,
+         * but that we can't find, is expired:
+         */
+        status = nfserr_expired;
        if (is_delegation_stateid(stateid)) {
                dp = find_delegation_stateid(ino, stateid);
                if (!dp)
@@ -2964,6 +3094,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                stp = find_stateid(stateid, flags);
                if (!stp)
                        goto out;
+                status = nfserr_bad_stateid;
                if (nfs4_check_fh(current_fh, stp))
                        goto out;
                if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3169,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
                 * a replayed close:
                 */
                sop = search_close_lru(stateid->si_stateownerid, flags);
+                /* It's not stale; let's assume it's expired: */
                if (sop == NULL)
-                        return nfserr_bad_stateid;
+                        return nfserr_expired;
                *sopp = sop;
                goto check_replay;
        }
@@ -3304,6 +3436,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        status = nfserr_bad_stateid;
        if (!is_delegation_stateid(stateid))
                goto out;
+        status = nfserr_expired;
        dp = find_delegation_stateid(inode, stateid);
        if (!dp)
                goto out;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..f35a94a04026 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                                goto out_nfserr;
                }
        }
-        if ((buflen -= 16) < 0)
-                goto out_resource;
-        if (unlikely(bmval2)) {
+        if (bmval2) {
+                if ((buflen -= 16) < 0)
+                        goto out_resource;
                WRITE32(3);
                WRITE32(bmval0);
                WRITE32(bmval1);
                WRITE32(bmval2);
-        } else if (likely(bmval1)) {
+        } else if (bmval1) {
+                if ((buflen -= 12) < 0)
+                        goto out_resource;
                WRITE32(2);
                WRITE32(bmval0);
                WRITE32(bmval1);
        } else {
+                if ((buflen -= 8) < 0)
+                        goto out_resource;
                WRITE32(1);
                WRITE32(bmval0);
        }
@@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                u32 word1 = nfsd_suppattrs1(minorversion);
                u32 word2 = nfsd_suppattrs2(minorversion);
-                if ((buflen -= 12) < 0)
-                        goto out_resource;
                if (!aclsupport)
                        word0 &= ~FATTR4_WORD0_ACL;
                if (!word2) {
+                        if ((buflen -= 12) < 0)
+                                goto out_resource;
                        WRITE32(2);
                        WRITE32(word0);
                        WRITE32(word1);
                } else {
+                        if ((buflen -= 16) < 0)
+                                goto out_resource;
                        WRITE32(3);
                        WRITE32(word0);
                        WRITE32(word1);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 06fa87e52e82..4514ebbee4d6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
 */
 enum {
        NFSD_Root = 1,
+#ifdef CONFIG_NFSD_DEPRECATED
        NFSD_Svc,
        NFSD_Add,
        NFSD_Del,
@@ -29,6 +30,7 @@ enum {
        NFSD_Unexport,
        NFSD_Getfd,
        NFSD_Getfs,
+#endif
        NFSD_List,
        NFSD_Export_features,
        NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
 /*
 * write() for these nodes.
 */
+#ifdef CONFIG_NFSD_DEPRECATED
 static ssize_t write_svc(struct file *file, char *buf, size_t size);
 static ssize_t write_add(struct file *file, char *buf, size_t size);
 static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
 static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+#endif
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+#ifdef CONFIG_NFSD_DEPRECATED
        [NFSD_Svc] = write_svc,
        [NFSD_Add] = write_add,
        [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
        [NFSD_Unexport] = write_unexport,
        [NFSD_Getfd] = write_getfd,
        [NFSD_Getfs] = write_getfs,
+#endif
        [NFSD_Fh] = write_filehandle,
        [NFSD_FO_UnlockIP] = write_unlock_ip,
        [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+        static int warned;
+        if (file->f_dentry->d_name.name[0] == '.' && !warned) {
+                printk(KERN_INFO
+                       "Warning: \"%s\" uses deprecated NFSD interface: %s."
+                       "  This will be removed in 2.6.40\n",
+                       current->comm, file->f_dentry->d_name.name);
+                warned = 1;
+        }
        if (! file->private_data) {
                /* An attempt to read a transaction file without writing
                 * causes a 0-byte write so that the file can return
@@ -187,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
 * payload - write methods
 */
+#ifdef CONFIG_NFSD_DEPRECATED
 /**
 * write_svc - Start kernel's NFSD server
 *
@@ -402,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
        ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-        clp = auth_unix_lookup(&in6);
+        clp = auth_unix_lookup(&init_net, &in6);
        if (!clp)
                err = -EPERM;
        else {
@@ -465,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
        ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-        clp = auth_unix_lookup(&in6);
+        clp = auth_unix_lookup(&init_net, &in6);
        if (!clp)
                err = -EPERM;
        else {
@@ -482,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 out:
        return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /**
 * write_unlock_ip - Release all locks used by a client
@@ -1000,12 +1016,12 @@ static ssize_t __write_ports_addxprt(char *buf)
        if (err != 0)
                return err;
-        err = svc_create_xprt(nfsd_serv, transport,
+        err = svc_create_xprt(nfsd_serv, transport, &init_net,
                                PF_INET, port, SVC_SOCK_ANONYMOUS);
        if (err < 0)
                goto out_err;
-        err = svc_create_xprt(nfsd_serv, transport,
+        err = svc_create_xprt(nfsd_serv, transport, &init_net,
                                PF_INET6, port, SVC_SOCK_ANONYMOUS);
        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_close;
@@ -1356,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
        static struct tree_descr nfsd_files[] = {
+#ifdef CONFIG_NFSD_DEPRECATED
                [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
                [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
                [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1363,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
                [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
                [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
+#endif
                [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
                [NFSD_Export_features] = {"export_features",
                                        &export_features_operations, S_IRUGO},
@@ -1387,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
        return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
-static int nfsd_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfsd_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
+        return mount_single(fs_type, flags, data, nfsd_fill_super);
 }
 static struct file_system_type nfsd_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfsd",
-        .get_sb         = nfsd_get_sb,
+        .mount          = nfsd_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..6b641cf2c19a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -249,7 +249,7 @@ extern time_t nfsd4_grace;
 #define COMPOUND_SLACK_SPACE            140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE        12     /* OP_SETATTR */
-#define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+#define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
 /*
 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..2bae1d86f5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
        if (!list_empty(&nfsd_serv->sv_permsocks))
                return 0;
-        error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+        error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
-        error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+        error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..39adc27b0685 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
@@ -64,19 +65,12 @@ typedef struct {
        (s)->si_fileid, \
        (s)->si_generation
-struct nfsd4_cb_sequence {
-        /* args/res */
-        u32                     cbs_minorversion;
-        struct nfs4_client      *cbs_clp;
-};
-struct nfs4_rpc_args {
-        void                            *args_op;
-        struct nfsd4_cb_sequence        args_seq;
-};
 struct nfsd4_callback {
-        struct nfs4_rpc_args cb_args;
+        void *cb_op;
+        struct nfs4_client *cb_clp;
+        u32 cb_minorversion;
+        struct rpc_message cb_msg;
+        const struct rpc_call_ops *cb_ops;
        struct work_struct cb_work;
 };
@@ -91,7 +85,6 @@ struct nfs4_delegation {
        u32                     dl_type;
        time_t                  dl_time;
 /* For recall: */
-        u32                     dl_ident;
        stateid_t               dl_stateid;
        struct knfsd_fh         dl_fh;
        int                     dl_retries;
@@ -103,8 +96,8 @@ struct nfs4_cb_conn {
        /* SETCLIENTID info */
        struct sockaddr_storage cb_addr;
        size_t                  cb_addrlen;
-        u32                     cb_prog;
+        u32                     cb_prog; /* used only in 4.0 case;
-        u32                     cb_minorversion;
+                                            per-session otherwise */
        u32                     cb_ident;       /* minorversion 0 only */
        struct svc_xprt         *cb_xprt;       /* minorversion 1 only */
 };
@@ -160,6 +153,15 @@ struct nfsd4_clid_slot {
        struct nfsd4_create_session     sl_cr_ses;
 };
+struct nfsd4_conn {
+        struct list_head cn_persession;
+        struct svc_xprt *cn_xprt;
+        struct svc_xpt_user cn_xpt_user;
+        struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+        unsigned char cn_flags;
+};
 struct nfsd4_session {
        struct kref             se_ref;
        struct list_head        se_hash;        /* hash by sessionid */
@@ -169,6 +171,9 @@ struct nfsd4_session {
        struct nfs4_sessionid   se_sessionid;
        struct nfsd4_channel_attrs se_fchannel;
        struct nfsd4_channel_attrs se_bchannel;
+        struct list_head        se_conns;
+        u32                     se_cb_prog;
+        u32                     se_cb_seq_nr;
        struct nfsd4_slot       *se_slots[];    /* forward channel slots */
 };
@@ -221,24 +226,32 @@ struct nfs4_client {
        clientid_t              cl_clientid;    /* generated by server */
        nfs4_verifier           cl_confirm;     /* generated by server */
        u32                     cl_firststate;  /* recovery dir creation */
+        u32                     cl_minorversion;
        /* for v4.0 and v4.1 callbacks: */
        struct nfs4_cb_conn     cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE  1
+#define NFSD4_CLIENT_KILL       2
+        unsigned long           cl_cb_flags;
        struct rpc_clnt         *cl_cb_client;
+        u32                     cl_cb_ident;
        atomic_t                cl_cb_set;
+        struct nfsd4_callback   cl_cb_null;
+        struct nfsd4_session    *cl_cb_session;
+        /* for all client information that callback code might need: */
+        spinlock_t              cl_lock;
        /* for nfs41 */
        struct list_head        cl_sessions;
        struct nfsd4_clid_slot  cl_cs_slot;     /* create_session slot */
        u32                     cl_exchange_flags;
-        struct nfs4_sessionid   cl_sessionid;
        /* number of rpc's in progress over an associated session: */
        atomic_t                cl_refcount;
        /* for nfs41 callbacks */
        /* We currently support a single back channel with a single slot */
        unsigned long           cl_cb_slot_busy;
-        u32                     cl_cb_seq_nr;
        struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
                                                /* wait here for slots */
 };
@@ -440,12 +453,13 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
-extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
        struct inode *inode = fhp->fh_dentry->d_inode;
        const struct export_operations *export_ops = inode->i_sb->s_export_op;
-        int error = 0;
        if (!EX_ISSYNC(fhp->fh_export))
                return 0;
-        if (export_ops->commit_metadata) {
+        if (export_ops->commit_metadata)
-                error = export_ops->commit_metadata(inode);
+                return export_ops->commit_metadata(inode);
-        } else {
+        return sync_inode_metadata(inode, 1);
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_ALL,
-                        .nr_to_write = 0, /* metadata only */
-                };
-                error = sync_inode(inode, &wbc);
-        }
-        return error;
 }
 /*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 185d1607cb00..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -207,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = nilfs_add_nondir(dentry, inode);
        if (!err)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index d926af626177..687d090cea34 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1609,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
        kunmap_atomic(kaddr, KM_USER0);
        if (!TestSetPageWriteback(clone_page))
-                inc_zone_page_state(clone_page, NR_WRITEBACK);
+                account_page_writeback(clone_page);
        unlock_page(clone_page);
        return 0;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 35ae03c0db86..f804d41ec9d3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1141,9 +1141,9 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
        return (void *)s->s_bdev == data;
 }
-static int
+static struct dentry *
-nilfs_get_sb(struct file_system_type *fs_type, int flags,
+nilfs_mount(struct file_system_type *fs_type, int flags,
-             const char *dev_name, void *data, struct vfsmount *mnt)
+             const char *dev_name, void *data)
 {
        struct nilfs_super_data sd;
        struct super_block *s;
@@ -1156,7 +1156,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
        sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(sd.bdev))
-                return PTR_ERR(sd.bdev);
+                return ERR_CAST(sd.bdev);
        sd.cno = 0;
        sd.flags = flags;
@@ -1235,9 +1235,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
        if (!s_new)
                close_bdev_exclusive(sd.bdev, mode);
-        mnt->mnt_sb = s;
+        return root_dentry;
-        mnt->mnt_root = root_dentry;
-        return 0;
 failed_super:
        deactivate_locked_super(s);
@@ -1245,13 +1243,13 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 failed:
        if (!s_new)
                close_bdev_exclusive(sd.bdev, mode);
-        return err;
+        return ERR_PTR(err);
 }
 struct file_system_type nilfs_fs_type = {
        .owner    = THIS_MODULE,
        .name     = "nilfs2",
-        .get_sb   = nilfs_get_sb,
+        .mount    = nilfs_mount,
        .kill_sb  = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..4498a208df94 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -88,8 +88,6 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
        struct dentry *parent;
        struct inode *p_inode;
-        bool send = false;
-        bool should_update_children = false;
        if (!dentry)
                dentry = path->dentry;
@@ -97,29 +95,12 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
        if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                return;
-        spin_lock(&dentry->d_lock);
+        parent = dget_parent(dentry);
-        parent = dentry->d_parent;
        p_inode = parent->d_inode;
-        if (fsnotify_inode_watches_children(p_inode)) {
+        if (unlikely(!fsnotify_inode_watches_children(p_inode)))
-                if (p_inode->i_fsnotify_mask & mask) {
+                __fsnotify_update_child_dentry_flags(p_inode);
-                        dget(parent);
+        else if (p_inode->i_fsnotify_mask & mask) {
-                        send = true;
-                }
-        } else {
-                /*
-                 * The parent doesn't care about events on it's children but
-                 * at least one child thought it did.  We need to run all the
-                 * children and update their d_flags to let them know p_inode
-                 * doesn't care about them any more.
-                 */
-                dget(parent);
-                should_update_children = true;
-        }
-        spin_unlock(&dentry->d_lock);
-        if (send) {
                /* we are notifying a parent so come up with the new mask which
                 * specifies these are events which came from a child. */
                mask |= FS_EVENT_ON_CHILD;
@@ -130,13 +111,9 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
                else
                        fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
                                 dentry->d_name.name, 0);
-                dput(parent);
        }
-        if (unlikely(should_update_children)) {
+        dput(parent);
-                __fsnotify_update_child_dentry_flags(p_inode);
-                dput(parent);
-        }
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..21ed10660b80 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -240,6 +240,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 {
        struct inode *inode, *next_i, *need_iput = NULL;
+        spin_lock(&inode_lock);
        list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
                struct inode *need_iput_tmp;
@@ -297,4 +298,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
                spin_lock(&inode_lock);
        }
+        spin_unlock(&inode_lock);
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 19c5180f8a28..a30ecacc01f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2911,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                goto unl_upcase_iput_tmp_ino_err_out_now;
        }
        if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-                /* We increment i_count simulating an ntfs_iget(). */
+                /* We grab a reference, simulating an ntfs_iget(). */
-                atomic_inc(&vol->root_ino->i_count);
+                ihold(vol->root_ino);
                ntfs_debug("Exiting, status successful.");
                /* Release the default upcase if it has no users. */
                mutex_lock(&ntfs_lock);
@@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now:
        if (vol->mft_ino && vol->mft_ino != tmp_ino)
                iput(vol->mft_ino);
        vol->mft_ino = NULL;
-        /*
-         * This is needed to get ntfs_clear_extent_inode() called for each
-         * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-         * leak resources and B) a subsequent mount fails automatically due to
-         * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-         * method again... FIXME: Do we need to do this twice now because of
-         * attribute inodes? I think not, so leave as is for now... (AIA)
-         */
-        if (invalidate_inodes(sb)) {
-                ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-                                "driver bug.");
-                /* Copied from fs/super.c. I just love this message. (-; */
-                printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-                                "seconds.  Have a nice day...\n");
-        }
        /* Errors at this stage are irrelevant. */
 err_out_now:
        sb->s_fs_info = NULL;
@@ -3074,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
-static int ntfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-                           mnt);
 }
 static struct file_system_type ntfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ntfs",
-        .get_sb         = ntfs_get_sb,
+        .mount          = ntfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee118158..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
         * ocfs2 never allocates in this function - the only time we
         * need to use BH_New is when we're extending i_size on a file
         * system which doesn't support holes, in which case BH_New
-         * allows block_prepare_write() to zero.
+         * allows __block_write_begin() to zero.
         *
         * If we see this on a sparse file system, then a truncate has
         * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to)
-{
-        int ret;
-        ret = block_prepare_write(page, from, to, ocfs2_get_block);
-        return ret;
-}
 /* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f663da6d..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to);
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         struct page *page,
                                                         unsigned from,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index a7ebd9d42dc8..b2df490a19ed 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
        if (inode) {
                ip = DLMFS_I(inode);
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
@@ -641,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
        .setattr        = dlmfs_file_setattr,
 };
-static int dlmfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 }
 static struct file_system_type dlmfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ocfs2_dlmfs",
-        .get_sb         = dlmfs_get_sb,
+        .mount          = dlmfs_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1ca6867935bb..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
                block_end = block_start + (1 << inode->i_blkbits);
                /*
-                 * block_start is block-aligned.  Bump it by one to
+                 * block_start is block-aligned.  Bump it by one to force
-                 * force ocfs2_{prepare,commit}_write() to zero the
+                 * __block_write_begin and block_commit_write to zero the
                 * whole block.
                 */
-                ret = ocfs2_prepare_write_nolock(inode, page,
+                ret = __block_write_begin(page, block_start + 1, 0,
-                                                 block_start + 1,
+                                          ocfs2_get_block);
-                                                 block_start + 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21149ae..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_commit;
        }
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f0cb395820..f02c0ef31578 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1236,14 +1236,12 @@ read_super_error:
        return status;
 }
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
                        int flags,
                        const char *dev_name,
-                        void *data,
+                        void *data)
-                        struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
-                           mnt);
 }
 static void ocfs2_kill_sb(struct super_block *sb)
@@ -1267,8 +1265,7 @@ out:
 static struct file_system_type ocfs2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ocfs2",
-        .get_sb         = ocfs2_get_sb, /* is this called when we mount
+        .mount          = ocfs2_mount,
-                                        * the fs? */
        .kill_sb        = ocfs2_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
        return ret;
 }
-static int omfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *omfs_mount(struct file_system_type *fs_type,
-                        int flags, const char *dev_name,
+                        int flags, const char *dev_name, void *data)
-                        void *data, struct vfsmount *m)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+        return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
 }
 static struct file_system_type omfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "omfs",
-        .get_sb = omfs_get_sb,
+        .mount = omfs_mount,
        .kill_sb = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..4197b9ed023d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
        /* Pick up the filp from the open intent */
        filp = nd->intent.open.file;
        /* Has the filesystem initialised the file for us? */
-        if (filp->f_path.dentry == NULL)
+        if (filp->f_path.dentry == NULL) {
+                path_get(&nd->path);
                filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
                                     NULL, cred);
-        else
+        }
-                path_put(&nd->path);
        return filp;
 }
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..ddb1f41376e5 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
        return ret;
 }
-static int openprom_get_sb(struct file_system_type *fs_type,
+static struct dentry *openprom_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
+        return mount_single(fs_type, flags, data, openprom_fill_super);
 }
 static struct file_system_type openprom_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "openpromfs",
-        .get_sb         = openprom_get_sb,
+        .mount          = openprom_mount,
        .kill_sb        = kill_anon_super,
 };
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b81bfc016a05..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -365,25 +365,17 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
        struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
-        struct gendisk *disk = part_to_disk(part);
-        struct request_queue *q = disk->queue;
-        unsigned long flags;
        part->start_sect = 0;
        part->nr_sects = 0;
        part_stat_set_all(part, 0);
        put_device(part_to_dev(part));
-        spin_lock_irqsave(q->queue_lock, flags);
-        elv_quiesce_end(q);
-        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 void delete_partition(struct gendisk *disk, int partno)
 {
        struct disk_part_tbl *ptbl = disk->part_tbl;
        struct hd_struct *part;
-        struct request_queue *q = disk->queue;
        if (partno >= ptbl->len)
                return;
@@ -398,10 +390,6 @@ void delete_partition(struct gendisk *disk, int partno)
        kobject_put(part->holder_dir);
        device_del(part_to_dev(part));
-        spin_lock_irq(q->queue_lock);
-        elv_quiesce_start(q);
-        spin_unlock_irq(q->queue_lock);
        call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index 37eb1ebeaa90..a8012a955720 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
        if (!inode)
                goto fail_inode;
+        inode->i_ino = get_next_ino();
        pipe = alloc_pipe_info(inode);
        if (!pipe)
                goto fail_iput;
@@ -1245,16 +1247,15 @@ out:
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
-static int pipefs_get_sb(struct file_system_type *fs_type,
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
-                         int flags, const char *dev_name, void *data,
+                         int flags, const char *dev_name, void *data)
-                         struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
+        return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
 }
 static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
-        .get_sb         = pipefs_get_sb,
+        .mount          = pipefs_mount,
        .kill_sb        = kill_anon_super,
 };
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..6a0068841d96 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
        depends on PROC_FS && MMU
 config PROC_VMCORE
-        bool "/proc/vmcore support (EXPERIMENTAL)"
+        bool "/proc/vmcore support"
-        depends on PROC_FS && CRASH_DUMP
+        depends on PROC_FS && CRASH_DUMP
        default y
        help
        Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc5d5f51f3fe..f3d02ca461ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 {
        struct mm_struct *mm;
-        if (mutex_lock_killable(&task->cred_guard_mutex))
+        if (mutex_lock_killable(&task->signal->cred_guard_mutex))
                return NULL;
        mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
                mmput(mm);
                mm = NULL;
        }
-        mutex_unlock(&task->cred_guard_mutex);
+        mutex_unlock(&task->signal->cred_guard_mutex);
        return mm;
 }
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
 static int mem_open(struct inode* inode, struct file* file)
 {
        file->private_data = (void*)((long)current->self_exec_id);
+        /* OK to pass negative loff_t, we can catch out-of-range */
+        file->f_mode |= FMODE_UNSIGNED_OFFSET;
        return 0;
 }
@@ -1023,28 +1025,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
-        if (copy_from_user(buffer, buf, count))
+        if (copy_from_user(buffer, buf, count)) {
-                return -EFAULT;
+                err = -EFAULT;
+                goto out;
+        }
        err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
        if (err)
-                return -EINVAL;
+                goto out;
        if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-             oom_adjust != OOM_DISABLE)
+             oom_adjust != OOM_DISABLE) {
-                return -EINVAL;
+                err = -EINVAL;
+                goto out;
+        }
        task = get_proc_task(file->f_path.dentry->d_inode);
-        if (!task)
+        if (!task) {
-                return -ESRCH;
+                err = -ESRCH;
+                goto out;
+        }
+        task_lock(task);
+        if (!task->mm) {
+                err = -EINVAL;
+                goto err_task_lock;
+        }
        if (!lock_task_sighand(task, &flags)) {
-                put_task_struct(task);
+                err = -ESRCH;
-                return -ESRCH;
+                goto err_task_lock;
        }
        if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-                unlock_task_sighand(task, &flags);
+                err = -EACCES;
-                put_task_struct(task);
+                goto err_sighand;
-                return -EACCES;
+        }
+        if (oom_adjust != task->signal->oom_adj) {
+                if (oom_adjust == OOM_DISABLE)
+                        atomic_inc(&task->mm->oom_disable_count);
+                if (task->signal->oom_adj == OOM_DISABLE)
+                        atomic_dec(&task->mm->oom_disable_count);
        }
        /*
@@ -1065,10 +1086,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+err_sighand:
        unlock_task_sighand(task, &flags);
+err_task_lock:
+        task_unlock(task);
        put_task_struct(task);
+out:
-        return count;
+        return err < 0 ? err : count;
 }
 static const struct file_operations proc_oom_adjust_operations = {
@@ -1109,30 +1133,49 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
-        if (copy_from_user(buffer, buf, count))
+        if (copy_from_user(buffer, buf, count)) {
-                return -EFAULT;
+                err = -EFAULT;
+                goto out;
+        }
        err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
        if (err)
-                return -EINVAL;
+                goto out;
        if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
-                        oom_score_adj > OOM_SCORE_ADJ_MAX)
+                        oom_score_adj > OOM_SCORE_ADJ_MAX) {
-                return -EINVAL;
+                err = -EINVAL;
+                goto out;
+        }
        task = get_proc_task(file->f_path.dentry->d_inode);
-        if (!task)
+        if (!task) {
-                return -ESRCH;
+                err = -ESRCH;
+                goto out;
+        }
+        task_lock(task);
+        if (!task->mm) {
+                err = -EINVAL;
+                goto err_task_lock;
+        }
        if (!lock_task_sighand(task, &flags)) {
-                put_task_struct(task);
+                err = -ESRCH;
-                return -ESRCH;
+                goto err_task_lock;
        }
        if (oom_score_adj < task->signal->oom_score_adj &&
                        !capable(CAP_SYS_RESOURCE)) {
-                unlock_task_sighand(task, &flags);
+                err = -EACCES;
-                put_task_struct(task);
+                goto err_sighand;
-                return -EACCES;
        }
+        if (oom_score_adj != task->signal->oom_score_adj) {
+                if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+                        atomic_inc(&task->mm->oom_disable_count);
+                if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                        atomic_dec(&task->mm->oom_disable_count);
+        }
        task->signal->oom_score_adj = oom_score_adj;
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,9 +1186,13 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
                                                        OOM_SCORE_ADJ_MAX;
+err_sighand:
        unlock_task_sighand(task, &flags);
+err_task_lock:
+        task_unlock(task);
        put_task_struct(task);
-        return count;
+out:
+        return err < 0 ? err : count;
 }
 static const struct file_operations proc_oom_score_adj_operations = {
@@ -1601,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
        /* Common stuff */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_op = &proc_def_inode_operations;
@@ -2306,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                goto out_free;
        /* Guard against adverse ptrace interaction */
-        length = mutex_lock_interruptible(&task->cred_guard_mutex);
+        length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
        if (length < 0)
                goto out_free;
        length = security_setprocattr(task,
                                      (char*)file->f_path.dentry->d_name.name,
                                      (void*)page, count);
-        mutex_unlock(&task->cred_guard_mutex);
+        mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
        free_page((unsigned long) page);
 out:
@@ -2547,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        /* Initialize the inode */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2fc52552271d..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        if (!inode)
                goto out;
+        inode->i_ino = get_next_ino();
        sysctl_head_get(head);
        ei = PROC_I(inode);
        ei->sysctl = head;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 93d99b316325..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
        return set_anon_super(sb, NULL);
 }
-static int proc_get_sb(struct file_system_type *fs_type,
+static struct dentry *proc_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        int err;
        struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
        sb = sget(fs_type, proc_test_super, proc_set_super, ns);
        if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
        if (!sb->s_root) {
                sb->s_flags = flags;
                err = proc_fill_super(sb);
                if (err) {
                        deactivate_locked_super(sb);
-                        return err;
+                        return ERR_PTR(err);
                }
                ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
                }
                sb->s_flags |= MS_ACTIVE;
-                ns->proc_mnt = mnt;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 }
 static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 static struct file_system_type proc_fs_type = {
        .name           = "proc",
-        .get_sb         = proc_get_sb,
+        .mount          = proc_mount,
        .kill_sb        = proc_kill_sb,
 };
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
                return;
        }
+        init_pid_ns.proc_mnt = proc_mnt;
        proc_symlink("mounts", NULL, "self/mounts");
        proc_net_init();
@@ -213,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
+        ns->proc_mnt = mnt;
        return 0;
 }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..37994737c983 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
        int i, j;
-        seq_printf(p, "                ");
+        seq_printf(p, "                    ");
        for_each_possible_cpu(i)
                seq_printf(p, "CPU%-8d", i);
        seq_printf(p, "\n");
        for (i = 0; i < NR_SOFTIRQS; i++) {
-                seq_printf(p, "%8s:", softirq_to_name[i]);
+                seq_printf(p, "%12s:", softirq_to_name[i]);
                for_each_possible_cpu(j)
                        seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
                seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..e15a19c93bae 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
        u64 sum_softirq = 0;
        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
        struct timespec boottime;
-        unsigned int per_irq_sum;
        user = nice = system = idle = iowait =
                irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
                guest_nice = cputime64_add(guest_nice,
                        kstat_cpu(i).cpustat.guest_nice);
-                for_each_irq_nr(j) {
+                sum += kstat_cpu_irqs_sum(i);
-                        sum += kstat_irqs_cpu(j, i);
-                }
                sum += arch_irq_stat_cpu(i);
                for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
        seq_printf(p, "intr %llu", (unsigned long long)sum);
        /* sum again ? it could be updated? */
-        for_each_irq_nr(j) {
+        for_each_irq_nr(j)
-                per_irq_sum = 0;
+                seq_printf(p, " %u", kstat_irqs(j));
-                for_each_possible_cpu(i)
-                        per_irq_sum += kstat_irqs_cpu(j, i);
-                seq_printf(p, " %u", per_irq_sum);
-        }
        seq_printf(p,
                "\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 871e25ed0069..da6b01d70f01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,6 +327,7 @@ struct mem_size_stats {
        unsigned long private_clean;
        unsigned long private_dirty;
        unsigned long referenced;
+        unsigned long anonymous;
        unsigned long swap;
        u64 pss;
 };
@@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                if (!page)
                        continue;
+                if (PageAnon(page))
+                        mss->anonymous += PAGE_SIZE;
                mss->resident += PAGE_SIZE;
                /* Accumulate the size in pages that have been accessed. */
                if (pte_young(ptent) || PageReferenced(page))
@@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
                   "Private_Clean:  %8lu kB\n"
                   "Private_Dirty:  %8lu kB\n"
                   "Referenced:     %8lu kB\n"
+                   "Anonymous:      %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
                   "MMUPageSize:    %8lu kB\n",
@@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
                   mss.private_clean >> 10,
                   mss.private_dirty >> 10,
                   mss.referenced >> 10,
+                   mss.anonymous >> 10,
                   mss.swap >> 10,
                   vma_kernel_pagesize(vma) >> 10,
                   vma_mmu_pagesize(vma) >> 10);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 01bad30026fc..fcada42f1aa3 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -454,17 +454,16 @@ static void destroy_inodecache(void)
        kmem_cache_destroy(qnx4_inode_cachep);
 }
-static int qnx4_get_sb(struct file_system_type *fs_type,
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-                           mnt);
 }
 static struct file_system_type qnx4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "qnx4",
-        .get_sb         = qnx4_get_sb,
+        .mount          = qnx4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 config QUOTA
        bool "Quota support"
+        select QUOTACTL
        help
          If you say Y here, you will be able to set per user limits for disk
          usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 config QUOTACTL
        bool
-        depends on XFS_QUOTA || QUOTA
+        default n
-        default y
 config QUOTACTL_COMPAT
        bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..0fed41e6efcd 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1386,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
+                /* We could race with quotaon or dqget() could have failed */
+                if (!got[cnt])
+                        continue;
                if (!inode->i_dquot[cnt]) {
                        inode->i_dquot[cnt] = got[cnt];
                        got[cnt] = NULL;
@@ -1736,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        qsize_t rsv_space = 0;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, ret = 0;
+        char is_valid[MAXQUOTAS] = {};
        char warntype_to[MAXQUOTAS];
        char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1757,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        space = cur_space + rsv_space;
        /* Build the transfer_from list and check the limits */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+                /*
+                 * Skip changes for same uid or gid or for turned off quota-type.
+                 */
                if (!transfer_to[cnt])
                        continue;
+                /* Avoid races with quotaoff() */
+                if (!sb_has_quota_active(inode->i_sb, cnt))
+                        continue;
+                is_valid[cnt] = 1;
                transfer_from[cnt] = inode->i_dquot[cnt];
                ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
                if (ret)
@@ -1772,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
         * Finally perform the needed transfer from transfer_from to transfer_to
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                /*
+                if (!is_valid[cnt])
-                 * Skip changes for same uid or gid or for turned off quota-type.
-                 */
-                if (!transfer_to[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        warntype_from_inodes[cnt] =
@@ -1801,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        mark_all_dquot_dirty(transfer_from);
        mark_all_dquot_dirty(transfer_to);
-        /* Pass back references to put */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                transfer_to[cnt] = transfer_from[cnt];
-warn:
        flush_warnings(transfer_to, warntype_to);
        flush_warnings(transfer_from, warntype_from_inodes);
        flush_warnings(transfer_from, warntype_from_space);
-        return ret;
+        /* Pass back references to put */
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                if (is_valid[cnt])
+                        transfer_to[cnt] = transfer_from[cnt];
+        return 0;
 over_quota:
        spin_unlock(&dq_data_lock);
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        goto warn;
+        flush_warnings(transfer_to, warntype_to);
+        return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                inode->i_mapping->a_ops = &ramfs_aops;
                inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
        return err;
 }
-int ramfs_get_sb(struct file_system_type *fs_type,
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
-static int rootfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+        return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
-                            mnt);
 }
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 static struct file_system_type ramfs_fs_type = {
        .name           = "ramfs",
-        .get_sb         = ramfs_get_sb,
+        .mount          = ramfs_mount,
        .kill_sb        = ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
        .name           = "rootfs",
-        .get_sb         = rootfs_get_sb,
+        .mount          = rootfs_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index e757ef26e4ce..431a0ed610c8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+        /*
+         * pos or pos+count is negative here, check overflow.
+         * too big "count" will be caught in rw_verify_area().
+         */
+        if ((pos < 0) && (pos + count < pos))
+                return -EOVERFLOW;
+        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+                return 0;
+        return -EINVAL;
+}
 /**
 * generic_file_llseek_unlocked - lockless generic llseek implementation
 * @file:       file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
                break;
        }
-        if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+        if (offset < 0 && __negative_fpos_check(file, offset, 0))
+                return -EINVAL;
+        if (offset > inode->i_sb->s_maxbytes)
                return -EINVAL;
        /* Special lock needed here? */
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                        offset += file->f_pos;
        }
        retval = -EINVAL;
-        if (offset >= 0) {
+        if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
@@ -221,13 +237,12 @@ bad:
 }
 #endif
 /*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
        struct inode *inode;
@@ -238,8 +253,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        if (unlikely((ssize_t) count < 0))
                return retval;
        pos = *ppos;
-        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
+        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
-                return retval;
+                retval = __negative_fpos_check(file, pos, count);
+                if (retval)
+                        return retval;
+        }
        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
                retval = locks_mandatory_area(
@@ -564,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
                              unsigned long nr_segs, unsigned long fast_segs,
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer)
-  {
+{
        unsigned long seg;
-        ssize_t ret;
+        ssize_t ret;
        struct iovec *iov = fast_pointer;
-        /*
+        /*
-         * SuS says "The readv() function *may* fail if the iovcnt argument
+         * SuS says "The readv() function *may* fail if the iovcnt argument
-         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-         * traditionally returned zero for zero segments, so...
+         * traditionally returned zero for zero segments, so...
-         */
+         */
        if (nr_segs == 0) {
                ret = 0;
-                goto out;
+                goto out;
        }
-        /*
+        /*
-         * First get the "struct iovec" from user memory and
+         * First get the "struct iovec" from user memory and
-         * verify all the pointers
+         * verify all the pointers
-         */
+         */
        if (nr_segs > UIO_MAXIOV) {
                ret = -EINVAL;
-                goto out;
+                goto out;
        }
        if (nr_segs > fast_segs) {
-                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
                if (iov == NULL) {
                        ret = -ENOMEM;
-                        goto out;
+                        goto out;
                }
-        }
+        }
        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
                ret = -EFAULT;
-                goto out;
+                goto out;
        }
-        /*
+        /*
         * According to the Single Unix Specification we should return EINVAL
         * if an element length is < 0 when cast to ssize_t or if the
         * total length would overflow the ssize_t return value of the
         * system call.
-         */
+         *
+         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+         * overflow case.
+         */
        ret = 0;
-        for (seg = 0; seg < nr_segs; seg++) {
+        for (seg = 0; seg < nr_segs; seg++) {
-                void __user *buf = iov[seg].iov_base;
+                void __user *buf = iov[seg].iov_base;
-                ssize_t len = (ssize_t)iov[seg].iov_len;
+                ssize_t len = (ssize_t)iov[seg].iov_len;
                /* see if we we're about to use an invalid len or if
                 * it's about to overflow ssize_t */
-                if (len < 0 || (ret + len < ret)) {
+                if (len < 0) {
                        ret = -EINVAL;
-                        goto out;
+                        goto out;
                }
                if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
                        ret = -EFAULT;
-                        goto out;
+                        goto out;
+                }
+                if (len > MAX_RW_COUNT - ret) {
+                        len = MAX_RW_COUNT - ret;
+                        iov[seg].iov_len = len;
                }
                ret += len;
-        }
+        }
 out:
        *ret_pointer = iov;
        return ret;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..41656d40dc5c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it.  So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers.  Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 /* special version of get_block that is only used by grab_tail_page right
-** now.  It is sent to block_prepare_write, and when you try to get a
+** now.  It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
         **
         ** We must fix the tail page for writing because it might have buffers
         ** that are mapped, but have a block number of 0.  This indicates tail
-         ** data that has been read directly into the page, and block_prepare_write
+         ** data that has been read directly into the page, and
-         ** won't trigger a get_block in this case.
+         ** __block_write_begin won't trigger a get_block in this case.
         */
        fix_tail_page_for_writing(tail_page);
-        retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+        retval = __reiserfs_write_begin(tail_page, tail_start,
+                                      tail_end - tail_start);
        if (retval)
                goto unlock;
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
        /* start within the page of the last block in the file */
        start = (offset / blocksize) * blocksize;
-        error = block_prepare_write(page, start, offset,
+        error = __block_write_begin(page, start, offset - start,
                                    reiserfs_get_block_create_0);
        if (error)
                goto unlock;
@@ -2438,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
                /* from this point on, we know the buffer is mapped to a
                 * real block and not a direct item
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else {
                        if (!trylock_buffer(bh)) {
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
        return ret;
 }
-int reiserfs_prepare_write(struct file *f, struct page *page,
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-                           unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
                th->t_refcount++;
        }
-        ret = block_prepare_write(page, from, to, reiserfs_get_block);
+        ret = __block_write_begin(page, from, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
                /* this gets a little ugly.  If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        }
        /* we unpack by finding the page with the tail, and calling
-         ** reiserfs_prepare_write on that page.  This will force a
+         ** __reiserfs_write_begin on that page.  This will force a
         ** reiserfs_get_block to unpack the tail for us.
         */
        index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        if (!page) {
                goto out;
        }
-        retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+        retval = __reiserfs_write_begin(page, write_from, 0);
        if (retval)
                goto out_unlock;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        reiserfs_update_sd(&th, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
        reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..3bf7a6457f4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2213,12 +2213,11 @@ out:
 #endif
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
                           int flags, const char *dev_name,
-                           void *data, struct vfsmount *mnt)
+                           void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
-                           mnt);
 }
 static int __init init_reiserfs_fs(void)
@@ -2253,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "reiserfs",
-        .get_sb = get_super_block,
+        .mount = get_super_block,
        .kill_sb = reiserfs_kill_sb,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 static void update_ctime(struct inode *inode)
 {
        struct timespec now = current_fs_time(inode->i_sb);
-        if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+        if (inode_unhashed(inode) || !inode->i_nlink ||
            timespec_equal(&inode->i_ctime, &now))
                return;
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
                        rxh->h_hash = cpu_to_le32(xahash);
                }
-                err = reiserfs_prepare_write(NULL, page, page_offset,
+                err = __reiserfs_write_begin(page, page_offset, chunk + skip);
-                                            page_offset + chunk + skip);
                if (!err) {
                        if (buffer)
                                memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268580535c92..6647f90e55cd 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -552,20 +552,19 @@ error_rsb:
 /*
 * get a superblock for mounting
 */
-static int romfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        int ret = -EINVAL;
+        struct dentry *ret = ERR_PTR(-EINVAL);
 #ifdef CONFIG_ROMFS_ON_MTD
-        ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
+        ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
-                         mnt);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-        if (ret == -EINVAL)
+        if (ret == ERR_PTR(-EINVAL))
-                ret = get_sb_bdev(fs_type, flags, dev_name, data,
+                ret = mount_bdev(fs_type, flags, dev_name, data,
-                                  romfs_fill_super, mnt);
+                                  romfs_fill_super);
 #endif
        return ret;
 }
@@ -592,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "romfs",
-        .get_sb         = romfs_get_sb,
+        .mount          = romfs_mount,
        .kill_sb        = romfs_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..b7b10aa30861 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
        return slack;
 }
-static long estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
        unsigned long ret;
        struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
        }
        if (end_time && !timed_out)
-                slack = estimate_accuracy(end_time);
+                slack = select_estimate_accuracy(end_time);
        retval = 0;
        for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
        }
        if (end_time && !timed_out)
-                slack = estimate_accuracy(end_time);
+                slack = select_estimate_accuracy(end_time);
        for (;;) {
                struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 0e7cb1395a94..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
        if (size) {
                char *p;
-                spin_lock(&dcache_lock);
                p = __d_path(path, root, buf, size);
-                spin_unlock(&dcache_lock);
                res = PTR_ERR(p);
                if (!IS_ERR(p)) {
                        char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 74047304b01a..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
                err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+                /*
+                 * Other callers might not initialize the si_addr_lsb field,
+                 * so check explicitly for the right codes here.
+                 */
+                if (kinfo->si_code == BUS_MCEERR_AR ||
+                    kinfo->si_code == BUS_MCEERR_AO)
+                        err |= __put_user((short) kinfo->si_addr_lsb,
+                                          &uinfo->ssi_addr_lsb);
+#endif
                break;
        case __SI_CHLD:
                err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index 2bc24a8c4039..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,56 +0,0 @@
-config SMB_FS
-        tristate "SMB file system support (OBSOLETE, please use CIFS)"
-        depends on BKL # probably unfixable
-        depends on INET
-        select NLS
-        help
-          SMB (Server Message Block) is the protocol Windows for Workgroups
-          (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-          files and printers over local networks.  Saying Y here allows you to
-          mount their file systems (often called "shares" in this context) and
-          access them just like any other Unix directory.  Currently, this
-          works only if the Windows machines use TCP/IP as the underlying
-          transport protocol, and not NetBEUI.  For details, read
-          <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-          available from <http://www.tldp.org/docs.html#howto>.
-          Note: if you just want your box to act as an SMB *server* and make
-          files and printing services available to Windows clients (which need
-          to have a TCP/IP stack), you don't need to say Y here; you can use
-          the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-          for that.
-          General information about how to connect Linux, Windows machines and
-          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-          To compile the SMB support as a module, choose M here:
-          the module will be called smbfs.  Most people say N, however.
-config SMB_NLS_DEFAULT
-        bool "Use a default NLS"
-        depends on SMB_FS
-        help
-          Enabling this will make smbfs use nls translations by default. You
-          need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-          settings and you need to give the default nls for the SMB server as
-          CONFIG_SMB_NLS_REMOTE.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
-config SMB_NLS_REMOTE
-        string "Default Remote NLS Option"
-        depends on SMB_NLS_DEFAULT
-        default "cp437"
-        help
-          This setting allows you to specify a default value for which
-          codepage the server uses. If this field is left blank no
-          translations will be done by default. The local codepage/charset
-          default to CONFIG_NLS_DEFAULT.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-obj-$(CONFIG_SMB_FS) += smbfs.o
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-                symlink.o smbiod.o request.o
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <asm/page.h>
-#include "smb_debug.h"
-#include "proto.h"
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-        struct smb_sb_info *server = server_from_inode(dir);
-        union  smb_dir_cache *cache = NULL;
-        struct page *page = NULL;
-        page = grab_cache_page(&dir->i_data, 0);
-        if (!page)
-                goto out;
-        if (!PageUptodate(page))
-                goto out_unlock;
-        cache = kmap(page);
-        cache->head.time = jiffies - SMB_MAX_AGE(server);
-        kunmap(page);
-        SetPageUptodate(page);
-out_unlock:
-        unlock_page(page);
-        page_cache_release(page);
-out:
-        return;
-}
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-        struct smb_sb_info *server = server_from_dentry(parent);
-        struct list_head *next;
-        struct dentry *dentry;
-        spin_lock(&dcache_lock);
-        next = parent->d_subdirs.next;
-        while (next != &parent->d_subdirs) {
-                dentry = list_entry(next, struct dentry, d_u.d_child);
-                dentry->d_fsdata = NULL;
-                smb_age_dentry(server, dentry);
-                next = next->next;
-        }
-        spin_unlock(&dcache_lock);
-}
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-        struct dentry *dent = dentry;
-        struct list_head *next;
-        if (d_validate(dent, parent)) {
-                if (dent->d_name.len <= SMB_MAXNAMELEN &&
-                    (unsigned long)dent->d_fsdata == fpos) {
-                        if (!dent->d_inode) {
-                                dput(dent);
-                                dent = NULL;
-                        }
-                        return dent;
-                }
-                dput(dent);
-        }
-        /* If a pointer is invalid, we search the dentry. */
-        spin_lock(&dcache_lock);
-        next = parent->d_subdirs.next;
-        while (next != &parent->d_subdirs) {
-                dent = list_entry(next, struct dentry, d_u.d_child);
-                if ((unsigned long)dent->d_fsdata == fpos) {
-                        if (dent->d_inode)
-                                dget_locked(dent);
-                        else
-                                dent = NULL;
-                        goto out_unlock;
-                }
-                next = next->next;
-        }
-        dent = NULL;
-out_unlock:
-        spin_unlock(&dcache_lock);
-        return dent;
-}
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-               struct smb_cache_control *ctrl, struct qstr *qname,
-               struct smb_fattr *entry)
-{
-        struct dentry *newdent, *dentry = filp->f_path.dentry;
-        struct inode *newino, *inode = dentry->d_inode;
-        struct smb_cache_control ctl = *ctrl;
-        int valid = 0;
-        int hashed = 0;
-        ino_t ino = 0;
-        qname->hash = full_name_hash(qname->name, qname->len);
-        if (dentry->d_op && dentry->d_op->d_hash)
-                if (dentry->d_op->d_hash(dentry, qname) != 0)
-                        goto end_advance;
-        newdent = d_lookup(dentry, qname);
-        if (!newdent) {
-                newdent = d_alloc(dentry, qname);
-                if (!newdent)
-                        goto end_advance;
-        } else {
-                hashed = 1;
-                memcpy((char *) newdent->d_name.name, qname->name,
-                       newdent->d_name.len);
-        }
-        if (!newdent->d_inode) {
-                smb_renew_times(newdent);
-                entry->f_ino = iunique(inode->i_sb, 2);
-                newino = smb_iget(inode->i_sb, entry);
-                if (newino) {
-                        smb_new_dentry(newdent);
-                        d_instantiate(newdent, newino);
-                        if (!hashed)
-                                d_rehash(newdent);
-                }
-        } else
-                smb_set_inode_attr(newdent->d_inode, entry);
-        if (newdent->d_inode) {
-                ino = newdent->d_inode->i_ino;
-                newdent->d_fsdata = (void *) ctl.fpos;
-                smb_new_dentry(newdent);
-        }
-        if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-                if (ctl.page) {
-                        kunmap(ctl.page);
-                        SetPageUptodate(ctl.page);
-                        unlock_page(ctl.page);
-                        page_cache_release(ctl.page);
-                }
-                ctl.cache = NULL;
-                ctl.idx  -= SMB_DIRCACHE_SIZE;
-                ctl.ofs  += 1;
-                ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
-                if (ctl.page)
-                        ctl.cache = kmap(ctl.page);
-        }
-        if (ctl.cache) {
-                ctl.cache->dentry[ctl.idx] = newdent;
-                valid = 1;
-        }
-        dput(newdent);
-end_advance:
-        if (!valid)
-                ctl.valid = 0;
-        if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-                if (!ino)
-                        ino = find_inode_number(dentry, qname);
-                if (!ino)
-                        ino = iunique(inode->i_sb, 2);
-                ctl.filled = filldir(dirent, qname->name, qname->len,
-                                     filp->f_pos, ino, DT_UNKNOWN);
-                if (!ctl.filled)
-                        filp->f_pos += 1;
-        }
-        ctl.fpos += 1;
-        ctl.idx  += 1;
-        *ctrl = ctl;
-        return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  dir.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/ctype.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <linux/smbno.h>
-#include "smb_debug.h"
-#include "proto.h"
-static int smb_readdir(struct file *, void *, filldir_t);
-static int smb_dir_open(struct inode *, struct file *);
-static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int smb_mkdir(struct inode *, struct dentry *, int);
-static int smb_rmdir(struct inode *, struct dentry *);
-static int smb_unlink(struct inode *, struct dentry *);
-static int smb_rename(struct inode *, struct dentry *,
-                      struct inode *, struct dentry *);
-static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
-static int smb_link(struct dentry *, struct inode *, struct dentry *);
-const struct file_operations smb_dir_operations =
-{
-        .llseek         = generic_file_llseek,
-        .read           = generic_read_dir,
-        .readdir        = smb_readdir,
-        .unlocked_ioctl = smb_ioctl,
-        .open           = smb_dir_open,
-};
-const struct inode_operations smb_dir_inode_operations =
-{
-        .create         = smb_create,
-        .lookup         = smb_lookup,
-        .unlink         = smb_unlink,
-        .mkdir          = smb_mkdir,
-        .rmdir          = smb_rmdir,
-        .rename         = smb_rename,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-};
-const struct inode_operations smb_dir_inode_operations_unix =
-{
-        .create         = smb_create,
-        .lookup         = smb_lookup,
-        .unlink         = smb_unlink,
-        .mkdir          = smb_mkdir,
-        .rmdir          = smb_rmdir,
-        .rename         = smb_rename,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-        .symlink        = smb_symlink,
-        .mknod          = smb_make_node,
-        .link           = smb_link,
-};
-/*
- * Read a directory, using filldir to fill the dirent memory.
- * smb_proc_readdir does the actual reading from the smb server.
- *
- * The cache code is almost directly taken from ncpfs
- */
-static int 
-smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-        struct dentry *dentry = filp->f_path.dentry;
-        struct inode *dir = dentry->d_inode;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        union  smb_dir_cache *cache = NULL;
-        struct smb_cache_control ctl;
-        struct page *page = NULL;
-        int result;
-        ctl.page  = NULL;
-        ctl.cache = NULL;
-        VERBOSE("reading %s/%s, f_pos=%d\n",
-                DENTRY_PATH(dentry),  (int) filp->f_pos);
-        result = 0;
-        lock_kernel();
-        switch ((unsigned int) filp->f_pos) {
-        case 0:
-                if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = 1;
-                /* fallthrough */
-        case 1:
-                if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = 2;
-        }
-        /*
-         * Make sure our inode is up-to-date.
-         */
-        result = smb_revalidate_inode(dentry);
-        if (result)
-                goto out;
-        page = grab_cache_page(&dir->i_data, 0);
-        if (!page)
-                goto read_really;
-        ctl.cache = cache = kmap(page);
-        ctl.head  = cache->head;
-        if (!PageUptodate(page) || !ctl.head.eof) {
-                VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
-                         DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
-                goto init_cache;
-        }
-        if (filp->f_pos == 2) {
-                if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
-                        goto init_cache;
-                /*
-                 * N.B. ncpfs checks mtime of dentry too here, we don't.
-                 *   1. common smb servers do not update mtime on dir changes
-                 *   2. it requires an extra smb request
-                 *      (revalidate has the same timeout as ctl.head.time)
-                 *
-                 * Instead smbfs invalidates its own cache on local changes
-                 * and remote changes are not seen until timeout.
-                 */
-        }
-        if (filp->f_pos > ctl.head.end)
-                goto finished;
-        ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
-        ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
-        ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;
-        for (;;) {
-                if (ctl.ofs != 0) {
-                        ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
-                        if (!ctl.page)
-                                goto invalid_cache;
-                        ctl.cache = kmap(ctl.page);
-                        if (!PageUptodate(ctl.page))
-                                goto invalid_cache;
-                }
-                while (ctl.idx < SMB_DIRCACHE_SIZE) {
-                        struct dentry *dent;
-                        int res;
-                        dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
-                                             dentry, filp->f_pos);
-                        if (!dent)
-                                goto invalid_cache;
-                        res = filldir(dirent, dent->d_name.name,
-                                      dent->d_name.len, filp->f_pos,
-                                      dent->d_inode->i_ino, DT_UNKNOWN);
-                        dput(dent);
-                        if (res)
-                                goto finished;
-                        filp->f_pos += 1;
-                        ctl.idx += 1;
-                        if (filp->f_pos > ctl.head.end)
-                                goto finished;
-                }
-                if (ctl.page) {
-                        kunmap(ctl.page);
-                        SetPageUptodate(ctl.page);
-                        unlock_page(ctl.page);
-                        page_cache_release(ctl.page);
-                        ctl.page = NULL;
-                }
-                ctl.idx  = 0;
-                ctl.ofs += 1;
-        }
-invalid_cache:
-        if (ctl.page) {
-                kunmap(ctl.page);
-                unlock_page(ctl.page);
-                page_cache_release(ctl.page);
-                ctl.page = NULL;
-        }
-        ctl.cache = cache;
-init_cache:
-        smb_invalidate_dircache_entries(dentry);
-        ctl.head.time = jiffies;
-        ctl.head.eof = 0;
-        ctl.fpos = 2;
-        ctl.ofs = 0;
-        ctl.idx = SMB_DIRCACHE_START;
-        ctl.filled = 0;
-        ctl.valid  = 1;
-read_really:
-        result = server->ops->readdir(filp, dirent, filldir, &ctl);
-        if (result == -ERESTARTSYS && page)
-                ClearPageUptodate(page);
-        if (ctl.idx == -1)
-                goto invalid_cache;     /* retry */
-        ctl.head.end = ctl.fpos - 1;
-        ctl.head.eof = ctl.valid;
-finished:
-        if (page) {
-                cache->head = ctl.head;
-                kunmap(page);
-                if (result != -ERESTARTSYS)
-                        SetPageUptodate(page);
-                unlock_page(page);
-                page_cache_release(page);
-        }
-        if (ctl.page) {
-                kunmap(ctl.page);
-                SetPageUptodate(ctl.page);
-                unlock_page(ctl.page);
-                page_cache_release(ctl.page);
-        }
-out:
-        unlock_kernel();
-        return result;
-}
-static int
-smb_dir_open(struct inode *dir, struct file *file)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        struct smb_sb_info *server;
-        int error = 0;
-        VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
-                file->f_path.dentry->d_name.name);
-        /*
-         * Directory timestamps in the core protocol aren't updated
-         * when a file is added, so we give them a very short TTL.
-         */
-        lock_kernel();
-        server = server_from_dentry(dentry);
-        if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
-                unsigned long age = jiffies - SMB_I(dir)->oldmtime;
-                if (age > 2*HZ)
-                        smb_invalid_dir_cache(dir);
-        }
-        /*
-         * Note: in order to allow the smbmount process to open the
-         * mount point, we only revalidate if the connection is valid or
-         * if the process is trying to access something other than the root.
-         */
-        if (server->state == CONN_VALID || !IS_ROOT(dentry))
-                error = smb_revalidate_inode(dentry);
-        unlock_kernel();
-        return error;
-}
-/*
- * Dentry operations routines
- */
-static int smb_lookup_validate(struct dentry *, struct nameidata *);
-static int smb_hash_dentry(struct dentry *, struct qstr *);
-static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
-static int smb_delete_dentry(struct dentry *);
-static const struct dentry_operations smbfs_dentry_operations =
-{
-        .d_revalidate   = smb_lookup_validate,
-        .d_hash         = smb_hash_dentry,
-        .d_compare      = smb_compare_dentry,
-        .d_delete       = smb_delete_dentry,
-};
-static const struct dentry_operations smbfs_dentry_operations_case =
-{
-        .d_revalidate   = smb_lookup_validate,
-        .d_delete       = smb_delete_dentry,
-};
-/*
- * This is the callback when the dcache has a lookup hit.
- */
-static int
-smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode * inode = dentry->d_inode;
-        unsigned long age = jiffies - dentry->d_time;
-        int valid;
-        /*
-         * The default validation is based on dentry age:
-         * we believe in dentries for a few seconds.  (But each
-         * successful server lookup renews the timestamp.)
-         */
-        valid = (age <= SMB_MAX_AGE(server));
-#ifdef SMBFS_DEBUG_VERBOSE
-        if (!valid)
-                VERBOSE("%s/%s not valid, age=%lu\n", 
-                        DENTRY_PATH(dentry), age);
-#endif
-        if (inode) {
-                lock_kernel();
-                if (is_bad_inode(inode)) {
-                        PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
-                        valid = 0;
-                } else if (!valid)
-                        valid = (smb_revalidate_inode(dentry) == 0);
-                unlock_kernel();
-        } else {
-                /*
-                 * What should we do for negative dentries?
-                 */
-        }
-        return valid;
-}
-static int 
-smb_hash_dentry(struct dentry *dir, struct qstr *this)
-{
-        unsigned long hash;
-        int i;
-        hash = init_name_hash();
-        for (i=0; i < this->len ; i++)
-                hash = partial_name_hash(tolower(this->name[i]), hash);
-        this->hash = end_name_hash(hash);
-  
-        return 0;
-}
-static int
-smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
-{
-        int i, result = 1;
-        if (a->len != b->len)
-                goto out;
-        for (i=0; i < a->len; i++) {
-                if (tolower(a->name[i]) != tolower(b->name[i]))
-                        goto out;
-        }
-        result = 0;
-out:
-        return result;
-}
-/*
- * This is the callback from dput() when d_count is going to 0.
- * We use this to unhash dentries with bad inodes.
- */
-static int
-smb_delete_dentry(struct dentry * dentry)
-{
-        if (dentry->d_inode) {
-                if (is_bad_inode(dentry->d_inode)) {
-                        PARANOIA("bad inode, unhashing %s/%s\n",
-                                 DENTRY_PATH(dentry));
-                        return 1;
-                }
-        } else {
-                /* N.B. Unhash negative dentries? */
-        }
-        return 0;
-}
-/*
- * Initialize a new dentry
- */
-void
-smb_new_dentry(struct dentry *dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        if (server->mnt->flags & SMB_MOUNT_CASE)
-                dentry->d_op = &smbfs_dentry_operations_case;
-        else
-                dentry->d_op = &smbfs_dentry_operations;
-        dentry->d_time = jiffies;
-}
-/*
- * Whenever a lookup succeeds, we know the parent directories
- * are all valid, so we want to update the dentry timestamps.
- * N.B. Move this to dcache?
- */
-void
-smb_renew_times(struct dentry * dentry)
-{
-        dget(dentry);
-        spin_lock(&dentry->d_lock);
-        for (;;) {
-                struct dentry *parent;
-                dentry->d_time = jiffies;
-                if (IS_ROOT(dentry))
-                        break;
-                parent = dentry->d_parent;
-                dget(parent);
-                spin_unlock(&dentry->d_lock);
-                dput(dentry);
-                dentry = parent;
-                spin_lock(&dentry->d_lock);
-        }
-        spin_unlock(&dentry->d_lock);
-        dput(dentry);
-}
-static struct dentry *
-smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct smb_fattr finfo;
-        struct inode *inode;
-        int error;
-        struct smb_sb_info *server;
-        error = -ENAMETOOLONG;
-        if (dentry->d_name.len > SMB_MAXNAMELEN)
-                goto out;
-        /* Do not allow lookup of names with backslashes in */
-        error = -EINVAL;
-        if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
-                goto out;
-        lock_kernel();
-        error = smb_proc_getattr(dentry, &finfo);
-#ifdef SMBFS_PARANOIA
-        if (error && error != -ENOENT)
-                PARANOIA("find %s/%s failed, error=%d\n",
-                         DENTRY_PATH(dentry), error);
-#endif
-        inode = NULL;
-        if (error == -ENOENT)
-                goto add_entry;
-        if (!error) {
-                error = -EACCES;
-                finfo.f_ino = iunique(dentry->d_sb, 2);
-                inode = smb_iget(dir->i_sb, &finfo);
-                if (inode) {
-        add_entry:
-                        server = server_from_dentry(dentry);
-                        if (server->mnt->flags & SMB_MOUNT_CASE)
-                                dentry->d_op = &smbfs_dentry_operations_case;
-                        else
-                                dentry->d_op = &smbfs_dentry_operations;
-                        d_add(dentry, inode);
-                        smb_renew_times(dentry);
-                        error = 0;
-                }
-        }
-        unlock_kernel();
-out:
-        return ERR_PTR(error);
-}
-/*
- * This code is common to all routines creating a new inode.
- */
-static int
-smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode *inode;
-        int error;
-        struct smb_fattr fattr;
-        VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
-        error = smb_proc_getattr(dentry, &fattr);
-        if (error)
-                goto out_close;
-        smb_renew_times(dentry);
-        fattr.f_ino = iunique(dentry->d_sb, 2);
-        inode = smb_iget(dentry->d_sb, &fattr);
-        if (!inode)
-                goto out_no_inode;
-        if (have_id) {
-                struct smb_inode_info *ei = SMB_I(inode);
-                ei->fileid = fileid;
-                ei->access = SMB_O_RDWR;
-                ei->open = server->generation;
-        }
-        d_instantiate(dentry, inode);
-out:
-        return error;
-out_no_inode:
-        error = -EACCES;
-out_close:
-        if (have_id) {
-                PARANOIA("%s/%s failed, error=%d, closing %u\n",
-                         DENTRY_PATH(dentry), error, fileid);
-                smb_close_fileid(dentry, fileid);
-        }
-        goto out;
-}
-/* N.B. How should the mode argument be used? */
-static int
-smb_create(struct inode *dir, struct dentry *dentry, int mode,
-                struct nameidata *nd)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        __u16 fileid;
-        int error;
-        struct iattr attr;
-        VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
-        lock_kernel();
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
-        if (!error) {
-                if (server->opt.capabilities & SMB_CAP_UNIX) {
-                        /* Set attributes for new file */
-                        attr.ia_valid = ATTR_MODE;
-                        attr.ia_mode = mode;
-                        error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-                }
-                error = smb_instantiate(dentry, fileid, 1);
-        } else {
-                PARANOIA("%s/%s failed, error=%d\n",
-                         DENTRY_PATH(dentry), error);
-        }
-        unlock_kernel();
-        return error;
-}
-/* N.B. How should the mode argument be used? */
-static int
-smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int error;
-        struct iattr attr;
-        lock_kernel();
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_mkdir(dentry);
-        if (!error) {
-                if (server->opt.capabilities & SMB_CAP_UNIX) {
-                        /* Set attributes for new directory */
-                        attr.ia_valid = ATTR_MODE;
-                        attr.ia_mode = mode;
-                        error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-                }
-                error = smb_instantiate(dentry, 0, 0);
-        }
-        unlock_kernel();
-        return error;
-}
-static int
-smb_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        struct inode *inode = dentry->d_inode;
-        int error;
-        /*
-         * Close the directory if it's open.
-         */
-        lock_kernel();
-        smb_close(inode);
-        /*
-         * Check that nobody else is using the directory..
-         */
-        error = -EBUSY;
-        if (!d_unhashed(dentry))
-                goto out;
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_rmdir(dentry);
-out:
-        unlock_kernel();
-        return error;
-}
-static int
-smb_unlink(struct inode *dir, struct dentry *dentry)
-{
-        int error;
-        /*
-         * Close the file if it's open.
-         */
-        lock_kernel();
-        smb_close(dentry->d_inode);
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_unlink(dentry);
-        if (!error)
-                smb_renew_times(dentry);
-        unlock_kernel();
-        return error;
-}
-static int
-smb_rename(struct inode *old_dir, struct dentry *old_dentry,
-           struct inode *new_dir, struct dentry *new_dentry)
-{
-        int error;
-        /*
-         * Close any open files, and check whether to delete the
-         * target before attempting the rename.
-         */
-        lock_kernel();
-        if (old_dentry->d_inode)
-                smb_close(old_dentry->d_inode);
-        if (new_dentry->d_inode) {
-                smb_close(new_dentry->d_inode);
-                error = smb_proc_unlink(new_dentry);
-                if (error) {
-                        VERBOSE("unlink %s/%s, error=%d\n",
-                                DENTRY_PATH(new_dentry), error);
-                        goto out;
-                }
-                /* FIXME */
-                d_delete(new_dentry);
-        }
-        smb_invalid_dir_cache(old_dir);
-        smb_invalid_dir_cache(new_dir);
-        error = smb_proc_mv(old_dentry, new_dentry);
-        if (!error) {
-                smb_renew_times(old_dentry);
-                smb_renew_times(new_dentry);
-        }
-out:
-        unlock_kernel();
-        return error;
-}
-/*
- * FIXME: samba servers won't let you create device nodes unless uid/gid
- * matches the connection credentials (and we don't know which those are ...)
- */
-static int
-smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-        int error;
-        struct iattr attr;
-        attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
-        attr.ia_mode = mode;
-        current_euid_egid(&attr.ia_uid, &attr.ia_gid);
-        if (!new_valid_dev(dev))
-                return -EINVAL;
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
-        if (!error) {
-                error = smb_instantiate(dentry, 0, 0);
-        }
-        return error;
-}
-/*
- * dentry = existing file
- * new_dentry = new file
- */
-static int
-smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
-{
-        int error;
-        DEBUG1("smb_link old=%s/%s new=%s/%s\n",
-               DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
-        if (!error) {
-                smb_renew_times(dentry);
-                error = smb_instantiate(new_dentry, 0, 0);
-        }
-        return error;
-}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- *  file.c
- *
- *  Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/aio.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-#include "smb_debug.h"
-#include "proto.h"
-static int
-smb_fsync(struct file *file, int datasync)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int result;
-        VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
-        /*
-         * The VFS will writepage() all dirty pages for us, but we
-         * should send a SMBflush to the server, letting it know that
-         * we want things synchronized with actual storage.
-         *
-         * Note: this function requires all pages to have been written already
-         *       (should be ok with writepage_sync)
-         */
-        result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
-        return result;
-}
-/*
- * Read a page synchronously.
- */
-static int
-smb_readpage_sync(struct dentry *dentry, struct page *page)
-{
-        char *buffer = kmap(page);
-        loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        unsigned int rsize = smb_get_rsize(server);
-        int count = PAGE_SIZE;
-        int result;
-        VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
-                DENTRY_PATH(dentry), count, offset, rsize);
-        result = smb_open(dentry, SMB_O_RDONLY);
-        if (result < 0)
-                goto io_error;
-        do {
-                if (count < rsize)
-                        rsize = count;
-                result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
-                if (result < 0)
-                        goto io_error;
-                count -= result;
-                offset += result;
-                buffer += result;
-                dentry->d_inode->i_atime =
-                        current_fs_time(dentry->d_inode->i_sb);
-                if (result < rsize)
-                        break;
-        } while (count);
-        memset(buffer, 0, count);
-        flush_dcache_page(page);
-        SetPageUptodate(page);
-        result = 0;
-io_error:
-        kunmap(page);
-        unlock_page(page);
-        return result;
-}
-/*
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_readpage(struct file *file, struct page *page)
-{
-        int             error;
-        struct dentry  *dentry = file->f_path.dentry;
-        page_cache_get(page);
-        error = smb_readpage_sync(dentry, page);
-        page_cache_release(page);
-        return error;
-}
-/*
- * Write a page synchronously.
- * Offset is the data offset within the page.
- */
-static int
-smb_writepage_sync(struct inode *inode, struct page *page,
-                   unsigned long pageoffset, unsigned int count)
-{
-        loff_t offset;
-        char *buffer = kmap(page) + pageoffset;
-        struct smb_sb_info *server = server_from_inode(inode);
-        unsigned int wsize = smb_get_wsize(server);
-        int ret = 0;
-        offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
-        VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
-        do {
-                int write_ret;
-                if (count < wsize)
-                        wsize = count;
-                write_ret = server->ops->write(inode, offset, wsize, buffer);
-                if (write_ret < 0) {
-                        PARANOIA("failed write, wsize=%d, write_ret=%d\n",
-                                 wsize, write_ret);
-                        ret = write_ret;
-                        break;
-                }
-                /* N.B. what if result < wsize?? */
-#ifdef SMBFS_PARANOIA
-                if (write_ret < wsize)
-                        PARANOIA("short write, wsize=%d, write_ret=%d\n",
-                                 wsize, write_ret);
-#endif
-                buffer += wsize;
-                offset += wsize;
-                count -= wsize;
-                /*
-                 * Update the inode now rather than waiting for a refresh.
-                 */
-                inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
-                SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
-                if (offset > inode->i_size)
-                        inode->i_size = offset;
-        } while (count);
-        kunmap(page);
-        return ret;
-}
-/*
- * Write a page to the server. This will be used for NFS swapping only
- * (for now), and we currently do this synchronously only.
- *
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_writepage(struct page *page, struct writeback_control *wbc)
-{
-        struct address_space *mapping = page->mapping;
-        struct inode *inode;
-        unsigned long end_index;
-        unsigned offset = PAGE_CACHE_SIZE;
-        int err;
-        BUG_ON(!mapping);
-        inode = mapping->host;
-        BUG_ON(!inode);
-        end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-        /* easy case */
-        if (page->index < end_index)
-                goto do_it;
-        /* things got complicated... */
-        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
-        /* OK, are we completely out? */
-        if (page->index >= end_index+1 || !offset)
-                return 0; /* truncated - don't care */
-do_it:
-        page_cache_get(page);
-        err = smb_writepage_sync(inode, page, 0, offset);
-        SetPageUptodate(page);
-        unlock_page(page);
-        page_cache_release(page);
-        return err;
-}
-static int
-smb_updatepage(struct file *file, struct page *page, unsigned long offset,
-               unsigned int count)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
-                ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
-        return smb_writepage_sync(dentry->d_inode, page, offset, count);
-}
-static ssize_t
-smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
-{
-        struct file * file = iocb->ki_filp;
-        struct dentry * dentry = file->f_path.dentry;
-        ssize_t status;
-        VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-                (unsigned long) iocb->ki_left, (unsigned long) pos);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
-                (long)dentry->d_inode->i_size,
-                dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
-        status = generic_file_aio_read(iocb, iov, nr_segs, pos);
-out:
-        return status;
-}
-static int
-smb_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-        struct dentry * dentry = file->f_path.dentry;
-        int     status;
-        VERBOSE("file %s/%s, address %lu - %lu\n",
-                DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%d\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        status = generic_file_mmap(file, vma);
-out:
-        return status;
-}
-static ssize_t
-smb_file_splice_read(struct file *file, loff_t *ppos,
-                     struct pipe_inode_info *pipe, size_t count,
-                     unsigned int flags)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        ssize_t status;
-        VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
-                DENTRY_PATH(dentry), *ppos, count);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        status = generic_file_splice_read(file, ppos, pipe, count, flags);
-out:
-        return status;
-}
-/*
- * This does the "real" work of the write. The generic routine has
- * allocated the page, locked it, done all the page alignment stuff
- * calculations etc. Now we should just copy the data from user
- * space and write it back to the real medium..
- *
- * If the writer ends up delaying the write, the writer needs to
- * increment the page use counts until he is done with the page.
- */
-static int smb_write_begin(struct file *file, struct address_space *mapping,
-                        loff_t pos, unsigned len, unsigned flags,
-                        struct page **pagep, void **fsdata)
-{
-        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        *pagep = grab_cache_page_write_begin(mapping, index, flags);
-        if (!*pagep)
-                return -ENOMEM;
-        return 0;
-}
-static int smb_write_end(struct file *file, struct address_space *mapping,
-                        loff_t pos, unsigned len, unsigned copied,
-                        struct page *page, void *fsdata)
-{
-        int status;
-        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-        lock_kernel();
-        status = smb_updatepage(file, page, offset, copied);
-        unlock_kernel();
-        if (!status) {
-                if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
-                        SetPageUptodate(page);
-                status = copied;
-        }
-        unlock_page(page);
-        page_cache_release(page);
-        return status;
-}
-const struct address_space_operations smb_file_aops = {
-        .readpage = smb_readpage,
-        .writepage = smb_writepage,
-        .write_begin = smb_write_begin,
-        .write_end = smb_write_end,
-};
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
-{
-        struct file * file = iocb->ki_filp;
-        struct dentry * dentry = file->f_path.dentry;
-        ssize_t result;
-        VERBOSE("file %s/%s, count=%lu@%lu\n",
-                DENTRY_PATH(dentry),
-                (unsigned long) iocb->ki_left, (unsigned long) pos);
-        result = smb_revalidate_inode(dentry);
-        if (result) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), result);
-                goto out;
-        }
-        result = smb_open(dentry, SMB_O_WRONLY);
-        if (result)
-                goto out;
-        if (iocb->ki_left > 0) {
-                result = generic_file_aio_write(iocb, iov, nr_segs, pos);
-                VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
-                        (long) file->f_pos, (long) dentry->d_inode->i_size,
-                        dentry->d_inode->i_mtime.tv_sec,
-                        dentry->d_inode->i_atime.tv_sec);
-        }
-out:
-        return result;
-}
-static int
-smb_file_open(struct inode *inode, struct file * file)
-{
-        int result;
-        struct dentry *dentry = file->f_path.dentry;
-        int smb_mode = (file->f_mode & O_ACCMODE) - 1;
-        lock_kernel();
-        result = smb_open(dentry, smb_mode);
-        if (result)
-                goto out;
-        SMB_I(inode)->openers++;
-out:
-        unlock_kernel();
-        return result;
-}
-static int
-smb_file_release(struct inode *inode, struct file * file)
-{
-        lock_kernel();
-        if (!--SMB_I(inode)->openers) {
-                /* We must flush any dirty pages now as we won't be able to
-                   write anything after close. mmap can trigger this.
-                   "openers" should perhaps include mmap'ers ... */
-                filemap_write_and_wait(inode->i_mapping);
-                smb_close(inode);
-        }
-        unlock_kernel();
-        return 0;
-}
-/*
- * Check whether the required access is compatible with
- * an inode's permission. SMB doesn't recognize superuser
- * privileges, so we need our own check for this.
- */
-static int
-smb_file_permission(struct inode *inode, int mask)
-{
-        int mode = inode->i_mode;
-        int error = 0;
-        VERBOSE("mode=%x, mask=%x\n", mode, mask);
-        /* Look at user permissions */
-        mode >>= 6;
-        if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
-                error = -EACCES;
-        return error;
-}
-static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-        loff_t ret;
-        lock_kernel();
-        ret = generic_file_llseek_unlocked(file, offset, origin);
-        unlock_kernel();
-        return ret;
-}
-const struct file_operations smb_file_operations =
-{
-        .llseek         = smb_remote_llseek,
-        .read           = do_sync_read,
-        .aio_read       = smb_file_aio_read,
-        .write          = do_sync_write,
-        .aio_write      = smb_file_aio_write,
-        .unlocked_ioctl = smb_ioctl,
-        .mmap           = smb_file_mmap,
-        .open           = smb_file_open,
-        .release        = smb_file_release,
-        .fsync          = smb_fsync,
-        .splice_read    = smb_file_splice_read,
-};
-const struct inode_operations smb_file_inode_operations =
-{
-        .permission     = smb_file_permission,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * getopt.c
- */
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/net.h>
-#include "getopt.h"
-/**
- *      smb_getopt - option parser
- *      @caller: name of the caller, for error messages
- *      @options: the options string
- *      @opts: an array of &struct option entries controlling parser operations
- *      @optopt: output; will contain the current option
- *      @optarg: output; will contain the value (if one exists)
- *      @flag: output; may be NULL; should point to a long for or'ing flags
- *      @value: output; may be NULL; will be overwritten with the integer value
- *              of the current argument.
- *
- *      Helper to parse options on the format used by mount ("a=b,c=d,e,f").
- *      Returns opts->val if a matching entry in the 'opts' array is found,
- *      0 when no more tokens are found, -1 if an error is encountered.
- */
-int smb_getopt(char *caller, char **options, struct option *opts,
-               char **optopt, char **optarg, unsigned long *flag,
-               unsigned long *value)
-{
-        char *token;
-        char *val;
-        int i;
-        do {
-                if ((token = strsep(options, ",")) == NULL)
-                        return 0;
-        } while (*token == '\0');
-        *optopt = token;
-        *optarg = NULL;
-        if ((val = strchr (token, '=')) != NULL) {
-                *val++ = 0;
-                if (value)
-                        *value = simple_strtoul(val, NULL, 0);
-                *optarg = val;
-        }
-        for (i = 0; opts[i].name != NULL; i++) {
-                if (!strcmp(opts[i].name, token)) {
-                        if (!opts[i].flag && (!val || !*val)) {
-                                printk("%s: the %s option requires an argument\n",
-                                       caller, token);
-                                return -1;
-                        }
-                        if (flag && opts[i].flag)
-                                *flag |= opts[i].flag;
-                        return opts[i].val;
-                }
-        }
-        printk("%s: Unrecognized mount option %s\n", caller, token);
-        return -1;
-}
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _LINUX_GETOPT_H
-#define _LINUX_GETOPT_H
-struct option {
-        const char *name;
-        unsigned long flag;
-        int val;
-};
-extern int smb_getopt(char *caller, char **options, struct option *opts,
-                      char **optopt, char **optarg, unsigned long *flag,
-                      unsigned long *value);
-#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 8fc5e50e142f..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
- *  inode.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-#include <linux/nls.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/highuid.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include "smb_debug.h"
-#include "getopt.h"
-#include "proto.h"
-/* Always pick a default string */
-#ifdef CONFIG_SMB_NLS_REMOTE
-#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
-#else
-#define SMB_NLS_REMOTE ""
-#endif
-#define SMB_TTL_DEFAULT 1000
-static void smb_evict_inode(struct inode *);
-static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct dentry *, struct kstatfs *);
-static int  smb_show_options(struct seq_file *, struct vfsmount *);
-static struct kmem_cache *smb_inode_cachep;
-static struct inode *smb_alloc_inode(struct super_block *sb)
-{
-        struct smb_inode_info *ei;
-        ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
-        if (!ei)
-                return NULL;
-        return &ei->vfs_inode;
-}
-static void smb_destroy_inode(struct inode *inode)
-{
-        kmem_cache_free(smb_inode_cachep, SMB_I(inode));
-}
-static void init_once(void *foo)
-{
-        struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-        inode_init_once(&ei->vfs_inode);
-}
-static int init_inodecache(void)
-{
-        smb_inode_cachep = kmem_cache_create("smb_inode_cache",
-                                             sizeof(struct smb_inode_info),
-                                             0, (SLAB_RECLAIM_ACCOUNT|
-                                                SLAB_MEM_SPREAD),
-                                             init_once);
-        if (smb_inode_cachep == NULL)
-                return -ENOMEM;
-        return 0;
-}
-static void destroy_inodecache(void)
-{
-        kmem_cache_destroy(smb_inode_cachep);
-}
-static int smb_remount(struct super_block *sb, int *flags, char *data)
-{
-        *flags |= MS_NODIRATIME;
-        return 0;
-}
-static const struct super_operations smb_sops =
-{
-        .alloc_inode    = smb_alloc_inode,
-        .destroy_inode  = smb_destroy_inode,
-        .drop_inode     = generic_delete_inode,
-        .evict_inode    = smb_evict_inode,
-        .put_super      = smb_put_super,
-        .statfs         = smb_statfs,
-        .show_options   = smb_show_options,
-        .remount_fs     = smb_remount,
-};
-/* We are always generating a new inode here */
-struct inode *
-smb_iget(struct super_block *sb, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = SMB_SB(sb);
-        struct inode *result;
-        DEBUG1("smb_iget: %p\n", fattr);
-        result = new_inode(sb);
-        if (!result)
-                return result;
-        result->i_ino = fattr->f_ino;
-        SMB_I(result)->open = 0;
-        SMB_I(result)->fileid = 0;
-        SMB_I(result)->access = 0;
-        SMB_I(result)->flags = 0;
-        SMB_I(result)->closed = 0;
-        SMB_I(result)->openers = 0;
-        smb_set_inode_attr(result, fattr);
-        if (S_ISREG(result->i_mode)) {
-                result->i_op = &smb_file_inode_operations;
-                result->i_fop = &smb_file_operations;
-                result->i_data.a_ops = &smb_file_aops;
-        } else if (S_ISDIR(result->i_mode)) {
-                if (server->opt.capabilities & SMB_CAP_UNIX)
-                        result->i_op = &smb_dir_inode_operations_unix;
-                else
-                        result->i_op = &smb_dir_inode_operations;
-                result->i_fop = &smb_dir_operations;
-        } else if (S_ISLNK(result->i_mode)) {
-                result->i_op = &smb_link_inode_operations;
-        } else {
-                init_special_inode(result, result->i_mode, fattr->f_rdev);
-        }
-        insert_inode_hash(result);
-        return result;
-}
-/*
- * Copy the inode data to a smb_fattr structure.
- */
-void
-smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-        memset(fattr, 0, sizeof(struct smb_fattr));
-        fattr->f_mode   = inode->i_mode;
-        fattr->f_nlink  = inode->i_nlink;
-        fattr->f_ino    = inode->i_ino;
-        fattr->f_uid    = inode->i_uid;
-        fattr->f_gid    = inode->i_gid;
-        fattr->f_size   = inode->i_size;
-        fattr->f_mtime  = inode->i_mtime;
-        fattr->f_ctime  = inode->i_ctime;
-        fattr->f_atime  = inode->i_atime;
-        fattr->f_blocks = inode->i_blocks;
-        fattr->attr     = SMB_I(inode)->attr;
-        /*
-         * Keep the attributes in sync with the inode permissions.
-         */
-        if (fattr->f_mode & S_IWUSR)
-                fattr->attr &= ~aRONLY;
-        else
-                fattr->attr |= aRONLY;
-}
-/*
- * Update the inode, possibly causing it to invalidate its pages if mtime/size
- * is different from last time.
- */
-void
-smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-        struct smb_inode_info *ei = SMB_I(inode);
-        /*
-         * A size change should have a different mtime, or same mtime
-         * but different size.
-         */
-        time_t last_time = inode->i_mtime.tv_sec;
-        loff_t last_sz = inode->i_size;
-        inode->i_mode   = fattr->f_mode;
-        inode->i_nlink  = fattr->f_nlink;
-        inode->i_uid    = fattr->f_uid;
-        inode->i_gid    = fattr->f_gid;
-        inode->i_ctime  = fattr->f_ctime;
-        inode->i_blocks = fattr->f_blocks;
-        inode->i_size   = fattr->f_size;
-        inode->i_mtime  = fattr->f_mtime;
-        inode->i_atime  = fattr->f_atime;
-        ei->attr = fattr->attr;
-        /*
-         * Update the "last time refreshed" field for revalidation.
-         */
-        ei->oldmtime = jiffies;
-        if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
-                VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
-                        inode->i_ino,
-                        (long) last_time, (long) inode->i_mtime.tv_sec,
-                        (long) last_sz, (long) inode->i_size);
-                if (!S_ISDIR(inode->i_mode))
-                        invalidate_remote_inode(inode);
-        }
-}
-/*
- * This is called if the connection has gone bad ...
- * try to kill off all the current inodes.
- */
-void
-smb_invalidate_inodes(struct smb_sb_info *server)
-{
-        VERBOSE("\n");
-        shrink_dcache_sb(SB_of(server));
-        invalidate_inodes(SB_of(server));
-}
-/*
- * This is called to update the inode attributes after
- * we've made changes to a file or directory.
- */
-static int
-smb_refresh_inode(struct dentry *dentry)
-{
-        struct inode *inode = dentry->d_inode;
-        int error;
-        struct smb_fattr fattr;
-        error = smb_proc_getattr(dentry, &fattr);
-        if (!error) {
-                smb_renew_times(dentry);
-                /*
-                 * Check whether the type part of the mode changed,
-                 * and don't update the attributes if it did.
-                 *
-                 * And don't dick with the root inode
-                 */
-                if (inode->i_ino == 2)
-                        return error;
-                if (S_ISLNK(inode->i_mode))
-                        return error;   /* VFS will deal with it */
-                if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
-                        smb_set_inode_attr(inode, &fattr);
-                } else {
-                        /*
-                         * Big trouble! The inode has become a new object,
-                         * so any operations attempted on it are invalid.
-                         *
-                         * To limit damage, mark the inode as bad so that
-                         * subsequent lookup validations will fail.
-                         */
-                        PARANOIA("%s/%s changed mode, %07o to %07o\n",
-                                 DENTRY_PATH(dentry),
-                                 inode->i_mode, fattr.f_mode);
-                        fattr.f_mode = inode->i_mode; /* save mode */
-                        make_bad_inode(inode);
-                        inode->i_mode = fattr.f_mode; /* restore mode */
-                        /*
-                         * No need to worry about unhashing the dentry: the
-                         * lookup validation will see that the inode is bad.
-                         * But we do want to invalidate the caches ...
-                         */
-                        if (!S_ISDIR(inode->i_mode))
-                                invalidate_remote_inode(inode);
-                        else
-                                smb_invalid_dir_cache(inode);
-                        error = -EIO;
-                }
-        }
-        return error;
-}
-/*
- * This is called when we want to check whether the inode
- * has changed on the server.  If it has changed, we must
- * invalidate our local caches.
- */
-int
-smb_revalidate_inode(struct dentry *dentry)
-{
-        struct smb_sb_info *s = server_from_dentry(dentry);
-        struct inode *inode = dentry->d_inode;
-        int error = 0;
-        DEBUG1("smb_revalidate_inode\n");
-        lock_kernel();
-        /*
-         * Check whether we've recently refreshed the inode.
-         */
-        if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
-                VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
-                        inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
-                goto out;
-        }
-        error = smb_refresh_inode(dentry);
-out:
-        unlock_kernel();
-        return error;
-}
-/*
- * This routine is called when i_nlink == 0 and i_count goes to 0.
- * All blocking cleanup operations need to go here to avoid races.
- */
-static void
-smb_evict_inode(struct inode *ino)
-{
-        DEBUG1("ino=%ld\n", ino->i_ino);
-        truncate_inode_pages(&ino->i_data, 0);
-        end_writeback(ino);
-        lock_kernel();
-        if (smb_close(ino))
-                PARANOIA("could not close inode %ld\n", ino->i_ino);
-        unlock_kernel();
-}
-static struct option opts[] = {
-        { "version",    0, 'v' },
-        { "win95",      SMB_MOUNT_WIN95, 1 },
-        { "oldattr",    SMB_MOUNT_OLDATTR, 1 },
-        { "dirattr",    SMB_MOUNT_DIRATTR, 1 },
-        { "case",       SMB_MOUNT_CASE, 1 },
-        { "uid",        0, 'u' },
-        { "gid",        0, 'g' },
-        { "file_mode",  0, 'f' },
-        { "dir_mode",   0, 'd' },
-        { "iocharset",  0, 'i' },
-        { "codepage",   0, 'c' },
-        { "ttl",        0, 't' },
-        { NULL,         0, 0}
-};
-static int
-parse_options(struct smb_mount_data_kernel *mnt, char *options)
-{
-        int c;
-        unsigned long flags;
-        unsigned long value;
-        char *optarg;
-        char *optopt;
-        flags = 0;
-        while ( (c = smb_getopt("smbfs", &options, opts,
-                                &optopt, &optarg, &flags, &value)) > 0) {
-                VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
-                switch (c) {
-                case 1:
-                        /* got a "flag" option */
-                        break;
-                case 'v':
-                        if (value != SMB_MOUNT_VERSION) {
-                        printk ("smbfs: Bad mount version %ld, expected %d\n",
-                                value, SMB_MOUNT_VERSION);
-                                return 0;
-                        }
-                        mnt->version = value;
-                        break;
-                case 'u':
-                        mnt->uid = value;
-                        flags |= SMB_MOUNT_UID;
-                        break;
-                case 'g':
-                        mnt->gid = value;
-                        flags |= SMB_MOUNT_GID;
-                        break;
-                case 'f':
-                        mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
-                        flags |= SMB_MOUNT_FMODE;
-                        break;
-                case 'd':
-                        mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
-                        flags |= SMB_MOUNT_DMODE;
-                        break;
-                case 'i':
-                        strlcpy(mnt->codepage.local_name, optarg, 
-                                SMB_NLS_MAXNAMELEN);
-                        break;
-                case 'c':
-                        strlcpy(mnt->codepage.remote_name, optarg,
-                                SMB_NLS_MAXNAMELEN);
-                        break;
-                case 't':
-                        mnt->ttl = value;
-                        break;
-                default:
-                        printk ("smbfs: Unrecognized mount option %s\n",
-                                optopt);
-                        return -1;
-                }
-        }
-        mnt->flags = flags;
-        return c;
-}
-/*
- * smb_show_options() is for displaying mount options in /proc/mounts.
- * It tries to avoid showing settings that were not changed from their
- * defaults.
- */
-static int
-smb_show_options(struct seq_file *s, struct vfsmount *m)
-{
-        struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
-        int i;
-        for (i = 0; opts[i].name != NULL; i++)
-                if (mnt->flags & opts[i].flag)
-                        seq_printf(s, ",%s", opts[i].name);
-        if (mnt->flags & SMB_MOUNT_UID)
-                seq_printf(s, ",uid=%d", mnt->uid);
-        if (mnt->flags & SMB_MOUNT_GID)
-                seq_printf(s, ",gid=%d", mnt->gid);
-        if (mnt->mounted_uid != 0)
-                seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
-        /* 
-         * Defaults for file_mode and dir_mode are unknown to us; they
-         * depend on the current umask of the user doing the mount.
-         */
-        if (mnt->flags & SMB_MOUNT_FMODE)
-                seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
-        if (mnt->flags & SMB_MOUNT_DMODE)
-                seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
-        if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
-                seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
-        if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
-                seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
-        if (mnt->ttl != SMB_TTL_DEFAULT)
-                seq_printf(s, ",ttl=%d", mnt->ttl);
-        return 0;
-}
-static void
-smb_unload_nls(struct smb_sb_info *server)
-{
-        unload_nls(server->remote_nls);
-        unload_nls(server->local_nls);
-}
-static void
-smb_put_super(struct super_block *sb)
-{
-        struct smb_sb_info *server = SMB_SB(sb);
-        lock_kernel();
-        smb_lock_server(server);
-        server->state = CONN_INVALID;
-        smbiod_unregister_server(server);
-        smb_close_socket(server);
-        if (server->conn_pid)
-                kill_pid(server->conn_pid, SIGTERM, 1);
-        bdi_destroy(&server->bdi);
-        kfree(server->ops);
-        smb_unload_nls(server);
-        sb->s_fs_info = NULL;
-        smb_unlock_server(server);
-        put_pid(server->conn_pid);
-        kfree(server);
-        unlock_kernel();
-}
-static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-        struct smb_sb_info *server;
-        struct smb_mount_data_kernel *mnt;
-        struct smb_mount_data *oldmnt;
-        struct inode *root_inode;
-        struct smb_fattr root;
-        int ver;
-        void *mem;
-        static int warn_count;
-        lock_kernel();
-        if (warn_count < 5) {
-                warn_count++;
-                printk(KERN_EMERG "smbfs is deprecated and will be removed"
-                        " from the 2.6.27 kernel. Please migrate to cifs\n");
-        }
-        if (!raw_data)
-                goto out_no_data;
-        oldmnt = (struct smb_mount_data *) raw_data;
-        ver = oldmnt->version;
-        if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
-                goto out_wrong_data;
-        sb->s_flags |= MS_NODIRATIME;
-        sb->s_blocksize = 1024; /* Eh...  Is this correct? */
-        sb->s_blocksize_bits = 10;
-        sb->s_magic = SMB_SUPER_MAGIC;
-        sb->s_op = &smb_sops;
-        sb->s_time_gran = 100;
-        server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
-        if (!server)
-                goto out_no_server;
-        sb->s_fs_info = server;
-        
-        if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
-                goto out_bdi;
-        sb->s_bdi = &server->bdi;
-        server->super_block = sb;
-        server->mnt = NULL;
-        server->sock_file = NULL;
-        init_waitqueue_head(&server->conn_wq);
-        init_MUTEX(&server->sem);
-        INIT_LIST_HEAD(&server->entry);
-        INIT_LIST_HEAD(&server->xmitq);
-        INIT_LIST_HEAD(&server->recvq);
-        server->conn_error = 0;
-        server->conn_pid = NULL;
-        server->state = CONN_INVALID; /* no connection yet */
-        server->generation = 0;
-        /* Allocate the global temp buffer and some superblock helper structs */
-        /* FIXME: move these to the smb_sb_info struct */
-        VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
-                sizeof(struct smb_mount_data_kernel));
-        mem = kmalloc(sizeof(struct smb_ops) +
-                      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
-        if (!mem)
-                goto out_no_mem;
-        server->ops = mem;
-        smb_install_null_ops(server->ops);
-        server->mnt = mem + sizeof(struct smb_ops);
-        /* Setup NLS stuff */
-        server->remote_nls = NULL;
-        server->local_nls = NULL;
-        mnt = server->mnt;
-        memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
-        strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
-                SMB_NLS_MAXNAMELEN);
-        strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
-                SMB_NLS_MAXNAMELEN);
-        mnt->ttl = SMB_TTL_DEFAULT;
-        if (ver == SMB_MOUNT_OLDVERSION) {
-                mnt->version = oldmnt->version;
-                SET_UID(mnt->uid, oldmnt->uid);
-                SET_GID(mnt->gid, oldmnt->gid);
-                mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
-                mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-                mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
-                        SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
-        } else {
-                mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-                                S_IROTH | S_IXOTH | S_IFREG;
-                mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-                                S_IROTH | S_IXOTH | S_IFDIR;
-                if (parse_options(mnt, raw_data))
-                        goto out_bad_option;
-        }
-        mnt->mounted_uid = current_uid();
-        smb_setcodepage(server, &mnt->codepage);
-        /*
-         * Display the enabled options
-         * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
-         */
-        if (mnt->flags & SMB_MOUNT_OLDATTR)
-                printk("SMBFS: Using core getattr (Win 95 speedup)\n");
-        else if (mnt->flags & SMB_MOUNT_DIRATTR)
-                printk("SMBFS: Using dir ff getattr\n");
-        if (smbiod_register_server(server) < 0) {
-                printk(KERN_ERR "smbfs: failed to start smbiod\n");
-                goto out_no_smbiod;
-        }
-        /*
-         * Keep the super block locked while we get the root inode.
-         */
-        smb_init_root_dirent(server, &root, sb);
-        root_inode = smb_iget(sb, &root);
-        if (!root_inode)
-                goto out_no_root;
-        sb->s_root = d_alloc_root(root_inode);
-        if (!sb->s_root)
-                goto out_no_root;
-        smb_new_dentry(sb->s_root);
-        unlock_kernel();
-        return 0;
-out_no_root:
-        iput(root_inode);
-out_no_smbiod:
-        smb_unload_nls(server);
-out_bad_option:
-        kfree(mem);
-out_no_mem:
-        bdi_destroy(&server->bdi);
-out_bdi:
-        if (!server->mnt)
-                printk(KERN_ERR "smb_fill_super: allocation failure\n");
-        sb->s_fs_info = NULL;
-        kfree(server);
-        goto out_fail;
-out_wrong_data:
-        printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
-        goto out_fail;
-out_no_data:
-        printk(KERN_ERR "smb_fill_super: missing data argument\n");
-out_fail:
-        unlock_kernel();
-        return -EINVAL;
-out_no_server:
-        printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
-        unlock_kernel();
-        return -ENOMEM;
-}
-static int
-smb_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-        int result;
-        
-        lock_kernel();
-        result = smb_proc_dskattr(dentry, buf);
-        unlock_kernel();
-        buf->f_type = SMB_SUPER_MAGIC;
-        buf->f_namelen = SMB_MAXPATHLEN;
-        return result;
-}
-int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
-{
-        int err = smb_revalidate_inode(dentry);
-        if (!err)
-                generic_fillattr(dentry->d_inode, stat);
-        return err;
-}
-int
-smb_notify_change(struct dentry *dentry, struct iattr *attr)
-{
-        struct inode *inode = dentry->d_inode;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
-        int error, changed, refresh = 0;
-        struct smb_fattr fattr;
-        lock_kernel();
-        error = smb_revalidate_inode(dentry);
-        if (error)
-                goto out;
-        if ((error = inode_change_ok(inode, attr)) < 0)
-                goto out;
-        error = -EPERM;
-        if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
-                goto out;
-        if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
-                goto out;
-        if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
-                goto out;
-        if ((attr->ia_valid & ATTR_SIZE) != 0) {
-                VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
-                        DENTRY_PATH(dentry),
-                        (long) inode->i_size, (long) attr->ia_size);
-                filemap_write_and_wait(inode->i_mapping);
-                error = smb_open(dentry, O_WRONLY);
-                if (error)
-                        goto out;
-                error = server->ops->truncate(inode, attr->ia_size);
-                if (error)
-                        goto out;
-                truncate_setsize(inode, attr->ia_size);
-                refresh = 1;
-        }
-        if (server->opt.capabilities & SMB_CAP_UNIX) {
-                /* For now we don't want to set the size with setattr_unix */
-                attr->ia_valid &= ~ATTR_SIZE;
-                /* FIXME: only call if we actually want to set something? */
-                error = smb_proc_setattr_unix(dentry, attr, 0, 0);
-                if (!error)
-                        refresh = 1;
-                goto out;
-        }
-        /*
-         * Initialize the fattr and check for changed fields.
-         * Note: CTIME under SMB is creation time rather than
-         * change time, so we don't attempt to change it.
-         */
-        smb_get_inode_attr(inode, &fattr);
-        changed = 0;
-        if ((attr->ia_valid & ATTR_MTIME) != 0) {
-                fattr.f_mtime = attr->ia_mtime;
-                changed = 1;
-        }
-        if ((attr->ia_valid & ATTR_ATIME) != 0) {
-                fattr.f_atime = attr->ia_atime;
-                /* Earlier protocols don't have an access time */
-                if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
-                        changed = 1;
-        }
-        if (changed) {
-                error = smb_proc_settime(dentry, &fattr);
-                if (error)
-                        goto out;
-                refresh = 1;
-        }
-        /*
-         * Check for mode changes ... we're extremely limited in
-         * what can be set for SMB servers: just the read-only bit.
-         */
-        if ((attr->ia_valid & ATTR_MODE) != 0) {
-                VERBOSE("%s/%s mode change, old=%x, new=%x\n",
-                        DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
-                changed = 0;
-                if (attr->ia_mode & S_IWUSR) {
-                        if (fattr.attr & aRONLY) {
-                                fattr.attr &= ~aRONLY;
-                                changed = 1;
-                        }
-                } else {
-                        if (!(fattr.attr & aRONLY)) {
-                                fattr.attr |= aRONLY;
-                                changed = 1;
-                        }
-                }
-                if (changed) {
-                        error = smb_proc_setattr(dentry, &fattr);
-                        if (error)
-                                goto out;
-                        refresh = 1;
-                }
-        }
-        error = 0;
-out:
-        if (refresh)
-                smb_refresh_inode(dentry);
-        unlock_kernel();
-        return error;
-}
-static int smb_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
-}
-static struct file_system_type smb_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "smbfs",
-        .get_sb         = smb_get_sb,
-        .kill_sb        = kill_anon_super,
-        .fs_flags       = FS_BINARY_MOUNTDATA,
-};
-static int __init init_smb_fs(void)
-{
-        int err;
-        DEBUG1("registering ...\n");
-        err = init_inodecache();
-        if (err)
-                goto out_inode;
-        err = smb_init_request_cache();
-        if (err)
-                goto out_request;
-        err = register_filesystem(&smb_fs_type);
-        if (err)
-                goto out;
-        return 0;
-out:
-        smb_destroy_request_cache();
-out_request:
-        destroy_inodecache();
-out_inode:
-        return err;
-}
-static void __exit exit_smb_fs(void)
-{
-        DEBUG1("unregistering ...\n");
-        unregister_filesystem(&smb_fs_type);
-        smb_destroy_request_cache();
-        destroy_inodecache();
-}
-module_init(init_smb_fs)
-module_exit(exit_smb_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  ioctl.c
- *
- *  Copyright (C) 1995, 1996 by Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/time.h>
-#include <linux/mm.h>
-#include <linux/highuid.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <asm/uaccess.h>
-#include "proto.h"
-long
-smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-        struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
-        struct smb_conn_opt opt;
-        int result = -EINVAL;
-        lock_kernel();
-        switch (cmd) {
-                uid16_t uid16;
-                uid_t uid32;
-        case SMB_IOC_GETMOUNTUID:
-                SET_UID(uid16, server->mnt->mounted_uid);
-                result = put_user(uid16, (uid16_t __user *) arg);
-                break;
-        case SMB_IOC_GETMOUNTUID32:
-                SET_UID(uid32, server->mnt->mounted_uid);
-                result = put_user(uid32, (uid_t __user *) arg);
-                break;
-        case SMB_IOC_NEWCONN:
-                /* arg is smb_conn_opt, or NULL if no connection was made */
-                if (!arg) {
-                        result = 0;
-                        smb_lock_server(server);
-                        server->state = CONN_RETRIED;
-                        printk(KERN_ERR "Connection attempt failed!  [%d]\n",
-                               server->conn_error);
-                        smbiod_flush(server);
-                        smb_unlock_server(server);
-                        break;
-                }
-                result = -EFAULT;
-                if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
-                        result = smb_newconn(server, &opt);
-                break;
-        default:
-                break;
-        }
-        unlock_kernel();
-        return result;
-}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
-/*
- *  proc.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/dcache.h>
-#include <linux/nls.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <net/sock.h>
-#include <asm/string.h>
-#include <asm/div64.h>
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-/* Features. Undefine if they cause problems, this should perhaps be a
-   config option. */
-#define SMBFS_POSIX_UNLINK 1
-/* Allow smb_retry to be interrupted. */
-#define SMB_RETRY_INTR
-#define SMB_VWV(packet)  ((packet) + SMB_HEADER_LEN)
-#define SMB_CMD(packet)  (*(packet+8))
-#define SMB_WCT(packet)  (*(packet+SMB_HEADER_LEN - 1))
-#define SMB_DIRINFO_SIZE 43
-#define SMB_STATUS_SIZE  21
-#define SMB_ST_BLKSIZE  (PAGE_SIZE)
-#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
-static struct smb_ops smb_ops_core;
-static struct smb_ops smb_ops_os2;
-static struct smb_ops smb_ops_win95;
-static struct smb_ops smb_ops_winNT;
-static struct smb_ops smb_ops_unix;
-static struct smb_ops smb_ops_null;
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *fattr);
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-                    struct smb_fattr *fattr);
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-                      u16 attr);
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-                     struct inode *inode, struct smb_fattr *fattr);
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server);
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src);
-static void
-str_upper(char *name, int len)
-{
-        while (len--)
-        {
-                if (*name >= 'a' && *name <= 'z')
-                        *name -= ('a' - 'A');
-                name++;
-        }
-}
-#if 0
-static void
-str_lower(char *name, int len)
-{
-        while (len--)
-        {
-                if (*name >= 'A' && *name <= 'Z')
-                        *name += ('a' - 'A');
-                name++;
-        }
-}
-#endif
-/* reverse a string inline. This is used by the dircache walking routines */
-static void reverse_string(char *buf, int len)
-{
-        char c;
-        char *end = buf+len-1;
-        while(buf < end) {
-                c = *buf;
-                *(buf++) = *end;
-                *(end--) = c;
-        }
-}
-/* no conversion, just a wrapper for memcpy. */
-static int convert_memcpy(unsigned char *output, int olen,
-                          const unsigned char *input, int ilen,
-                          struct nls_table *nls_from,
-                          struct nls_table *nls_to)
-{
-        if (olen < ilen)
-                return -ENAMETOOLONG;
-        memcpy(output, input, ilen);
-        return ilen;
-}
-static inline int write_char(unsigned char ch, char *output, int olen)
-{
-        if (olen < 4)
-                return -ENAMETOOLONG;
-        sprintf(output, ":x%02x", ch);
-        return 4;
-}
-static inline int write_unichar(wchar_t ch, char *output, int olen)
-{
-        if (olen < 5)
-                return -ENAMETOOLONG;
-        sprintf(output, ":%04x", ch);
-        return 5;
-}
-/* convert from one "codepage" to another (possibly being utf8). */
-static int convert_cp(unsigned char *output, int olen,
-                      const unsigned char *input, int ilen,
-                      struct nls_table *nls_from,
-                      struct nls_table *nls_to)
-{
-        int len = 0;
-        int n;
-        wchar_t ch;
-        while (ilen > 0) {
-                /* convert by changing to unicode and back to the new cp */
-                n = nls_from->char2uni(input, ilen, &ch);
-                if (n == -EINVAL) {
-                        ilen--;
-                        n = write_char(*input++, output, olen);
-                        if (n < 0)
-                                goto fail;
-                        output += n;
-                        olen -= n;
-                        len += n;
-                        continue;
-                } else if (n < 0)
-                        goto fail;
-                input += n;
-                ilen -= n;
-                n = nls_to->uni2char(ch, output, olen);
-                if (n == -EINVAL)
-                        n = write_unichar(ch, output, olen);
-                if (n < 0)
-                        goto fail;
-                output += n;
-                olen -= n;
-                len += n;
-        }
-        return len;
-fail:
-        return n;
-}
-/* ----------------------------------------------------------- */
-/*
- * nls_unicode
- *
- * This encodes/decodes little endian unicode format
- */
-static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
-{
-        if (boundlen < 2)
-                return -EINVAL;
-        *out++ = uni & 0xff;
-        *out++ = uni >> 8;
-        return 2;
-}
-static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
-{
-        if (boundlen < 2)
-                return -EINVAL;
-        *uni = (rawstring[1] << 8) | rawstring[0];
-        return 2;
-}
-static struct nls_table unicode_table = {
-        .charset        = "unicode",
-        .uni2char       = uni2char,
-        .char2uni       = char2uni,
-};
-/* ----------------------------------------------------------- */
-static int setcodepage(struct nls_table **p, char *name)
-{
-        struct nls_table *nls;
-        if (!name || !*name) {
-                nls = NULL;
-        } else if ( (nls = load_nls(name)) == NULL) {
-                printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
-                return -EINVAL;
-        }
-        /* if already set, unload the previous one. */
-        if (*p && *p != &unicode_table)
-                unload_nls(*p);
-        *p = nls;
-        return 0;
-}
-/* Handles all changes to codepage settings. */
-int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
-{
-        int n = 0;
-        smb_lock_server(server);
-        /* Don't load any nls_* at all, if no remote is requested */
-        if (!*cp->remote_name)
-                goto out;
-        /* local */
-        n = setcodepage(&server->local_nls, cp->local_name);
-        if (n != 0)
-                goto out;
-        /* remote */
-        if (!strcmp(cp->remote_name, "unicode")) {
-                server->remote_nls = &unicode_table;
-        } else {
-                n = setcodepage(&server->remote_nls, cp->remote_name);
-                if (n != 0)
-                        setcodepage(&server->local_nls, NULL);
-        }
-out:
-        if (server->local_nls != NULL && server->remote_nls != NULL)
-                server->ops->convert = convert_cp;
-        else
-                server->ops->convert = convert_memcpy;
-        smb_unlock_server(server);
-        return n;
-}
-/*****************************************************************************/
-/*                                                                           */
-/*  Encoding/Decoding section                                                */
-/*                                                                           */
-/*****************************************************************************/
-static __u8 *
-smb_encode_smb_length(__u8 * p, __u32 len)
-{
-        *p = 0;
-        *(p+1) = 0;
-        *(p+2) = (len & 0xFF00) >> 8;
-        *(p+3) = (len & 0xFF);
-        if (len > 0xFFFF)
-        {
-                *(p+1) = 1;
-        }
-        return p + 4;
-}
-/*
- * smb_build_path: build the path to entry and name storing it in buf.
- * The path returned will have the trailing '\0'.
- */
-static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
-                          int maxlen,
-                          struct dentry *entry, struct qstr *name)
-{
-        unsigned char *path = buf;
-        int len;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
-        if (maxlen < (2<<unicode))
-                return -ENAMETOOLONG;
-        if (maxlen > SMB_MAXPATHLEN + 1)
-                maxlen = SMB_MAXPATHLEN + 1;
-        if (entry == NULL)
-                goto test_name_and_out;
-        /*
-         * If IS_ROOT, we have to do no walking at all.
-         */
-        if (IS_ROOT(entry) && !name) {
-                *path++ = '\\';
-                if (unicode) *path++ = '\0';
-                *path++ = '\0';
-                if (unicode) *path++ = '\0';
-                return path-buf;
-        }
-        /*
-         * Build the path string walking the tree backward from end to ROOT
-         * and store it in reversed order [see reverse_string()]
-         */
-        dget(entry);
-        spin_lock(&entry->d_lock);
-        while (!IS_ROOT(entry)) {
-                struct dentry *parent;
-                if (maxlen < (3<<unicode)) {
-                        spin_unlock(&entry->d_lock);
-                        dput(entry);
-                        return -ENAMETOOLONG;
-                }
-                len = server->ops->convert(path, maxlen-2, 
-                                      entry->d_name.name, entry->d_name.len,
-                                      server->local_nls, server->remote_nls);
-                if (len < 0) {
-                        spin_unlock(&entry->d_lock);
-                        dput(entry);
-                        return len;
-                }
-                reverse_string(path, len);
-                path += len;
-                if (unicode) {
-                        /* Note: reverse order */
-                        *path++ = '\0';
-                        maxlen--;
-                }
-                *path++ = '\\';
-                maxlen -= len+1;
-                parent = entry->d_parent;
-                dget(parent);
-                spin_unlock(&entry->d_lock);
-                dput(entry);
-                entry = parent;
-                spin_lock(&entry->d_lock);
-        }
-        spin_unlock(&entry->d_lock);
-        dput(entry);
-        reverse_string(buf, path-buf);
-        /* maxlen has space for at least one char */
-test_name_and_out:
-        if (name) {
-                if (maxlen < (3<<unicode))
-                        return -ENAMETOOLONG;
-                *path++ = '\\';
-                if (unicode) {
-                        *path++ = '\0';
-                        maxlen--;
-                }
-                len = server->ops->convert(path, maxlen-2, 
-                                      name->name, name->len,
-                                      server->local_nls, server->remote_nls);
-                if (len < 0)
-                        return len;
-                path += len;
-                maxlen -= len+1;
-        }
-        /* maxlen has space for at least one char */
-        *path++ = '\0';
-        if (unicode) *path++ = '\0';
-        return path-buf;
-}
-static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
-                           struct dentry *dir, struct qstr *name)
-{
-        int result;
-        result = smb_build_path(server, buf, maxlen, dir, name);
-        if (result < 0)
-                goto out;
-        if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
-                str_upper(buf, result);
-out:
-        return result;
-}
-/* encode_path for non-trans2 request SMBs */
-static int smb_simple_encode_path(struct smb_request *req, char **p,
-                                  struct dentry * entry, struct qstr * name)
-{
-        struct smb_sb_info *server = req->rq_server;
-        char *s = *p;
-        int res;
-        int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-        if (!maxlen)
-                return -ENAMETOOLONG;
-        *s++ = 4;       /* ASCII data format */
-        /*
-         * SMB Unicode strings must be 16bit aligned relative the start of the
-         * packet. If they are not they must be padded with 0.
-         */
-        if (unicode) {
-                int align = s - (char *)req->rq_buffer;
-                if (!(align & 1)) {
-                        *s++ = '\0';
-                        maxlen--;
-                }
-        }
-        res = smb_encode_path(server, s, maxlen-1, entry, name);
-        if (res < 0)
-                return res;
-        *p = s + res;
-        return 0;
-}
-/* The following are taken directly from msdos-fs */
-/* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
-                  /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
-static time_t
-utc2local(struct smb_sb_info *server, time_t time)
-{
-        return time - server->opt.serverzone*60;
-}
-static time_t
-local2utc(struct smb_sb_info *server, time_t time)
-{
-        return time + server->opt.serverzone*60;
-}
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-static time_t
-date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
-{
-        int month, year;
-        time_t secs;
-        /* first subtract and mask after that... Otherwise, if
-           date == 0, bad things happen */
-        month = ((date >> 5) - 1) & 15;
-        year = date >> 9;
-        secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
-            ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
-                                                   month < 2 ? 1 : 0) + 3653);
-        /* days since 1.1.70 plus 80's leap day */
-        return local2utc(server, secs);
-}
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-static void
-date_unix2dos(struct smb_sb_info *server,
-              int unix_date, __u16 *date, __u16 *time)
-{
-        int day, year, nl_day, month;
-        unix_date = utc2local(server, unix_date);
-        if (unix_date < 315532800)
-                unix_date = 315532800;
-        *time = (unix_date % 60) / 2 +
-                (((unix_date / 60) % 60) << 5) +
-                (((unix_date / 3600) % 24) << 11);
-        day = unix_date / 86400 - 3652;
-        year = day / 365;
-        if ((year + 3) / 4 + 365 * year > day)
-                year--;
-        day -= (year + 3) / 4 + 365 * year;
-        if (day == 59 && !(year & 3)) {
-                nl_day = day;
-                month = 2;
-        } else {
-                nl_day = (year & 3) || day <= 59 ? day : day - 1;
-                for (month = 1; month < 12; month++)
-                        if (day_n[month] > nl_day)
-                                break;
-        }
-        *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
-}
-/* The following are taken from fs/ntfs/util.c */
-#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
-/*
- * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
- * into Unix UTC (based 1970-01-01, in seconds).
- */
-static struct timespec
-smb_ntutc2unixutc(u64 ntutc)
-{
-        struct timespec ts;
-        /* FIXME: what about the timezone difference? */
-        /* Subtract the NTFS time offset, then convert to 1s intervals. */
-        u64 t = ntutc - NTFS_TIME_OFFSET;
-        ts.tv_nsec = do_div(t, 10000000) * 100;
-        ts.tv_sec = t; 
-        return ts;
-}
-/* Convert the Unix UTC into NT time */
-static u64
-smb_unixutc2ntutc(struct timespec ts)
-{
-        /* Note: timezone conversion is probably wrong. */
-        /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
-        return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
-}
-#define MAX_FILE_MODE   6
-static mode_t file_mode[] = {
-        S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
-};
-static int smb_filetype_to_mode(u32 filetype)
-{
-        if (filetype > MAX_FILE_MODE) {
-                PARANOIA("Filetype out of range: %d\n", filetype);
-                return S_IFREG;
-        }
-        return file_mode[filetype];
-}
-static u32 smb_filetype_from_mode(int mode)
-{
-        if (S_ISREG(mode))
-                return UNIX_TYPE_FILE;
-        if (S_ISDIR(mode))
-                return UNIX_TYPE_DIR;
-        if (S_ISLNK(mode))
-                return UNIX_TYPE_SYMLINK;
-        if (S_ISCHR(mode))
-                return UNIX_TYPE_CHARDEV;
-        if (S_ISBLK(mode))
-                return UNIX_TYPE_BLKDEV;
-        if (S_ISFIFO(mode))
-                return UNIX_TYPE_FIFO;
-        if (S_ISSOCK(mode))
-                return UNIX_TYPE_SOCKET;
-        return UNIX_TYPE_UNKNOWN;
-}
-/*****************************************************************************/
-/*                                                                           */
-/*  Support section.                                                         */
-/*                                                                           */
-/*****************************************************************************/
-__u32
-smb_len(__u8 * p)
-{
-        return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
-}
-static __u16
-smb_bcc(__u8 * packet)
-{
-        int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
-        return WVAL(packet, pos);
-}
-/* smb_valid_packet: We check if packet fulfills the basic
-   requirements of a smb packet */
-static int
-smb_valid_packet(__u8 * packet)
-{
-        return (packet[4] == 0xff
-                && packet[5] == 'S'
-                && packet[6] == 'M'
-                && packet[7] == 'B'
-                && (smb_len(packet) + 4 == SMB_HEADER_LEN
-                    + SMB_WCT(packet) * 2 + smb_bcc(packet)));
-}
-/* smb_verify: We check if we got the answer we expected, and if we
-   got enough data. If bcc == -1, we don't care. */
-static int
-smb_verify(__u8 * packet, int command, int wct, int bcc)
-{
-        if (SMB_CMD(packet) != command)
-                goto bad_command;
-        if (SMB_WCT(packet) < wct)
-                goto bad_wct;
-        if (bcc != -1 && smb_bcc(packet) < bcc)
-                goto bad_bcc;
-        return 0;
-bad_command:
-        printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
-               command, SMB_CMD(packet));
-        goto fail;
-bad_wct:
-        printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
-               command, wct, SMB_WCT(packet));
-        goto fail;
-bad_bcc:
-        printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
-               command, bcc, smb_bcc(packet));
-fail:
-        return -EIO;
-}
-/*
- * Returns the maximum read or write size for the "payload". Making all of the
- * packet fit within the negotiated max_xmit size.
- *
- * N.B. Since this value is usually computed before locking the server,
- * the server's packet size must never be decreased!
- */
-static inline int
-smb_get_xmitsize(struct smb_sb_info *server, int overhead)
-{
-        return server->opt.max_xmit - overhead;
-}
-/*
- * Calculate the maximum read size
- */
-int
-smb_get_rsize(struct smb_sb_info *server)
-{
-        /* readX has 12 parameters, read has 5 */
-        int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
-        int size = smb_get_xmitsize(server, overhead);
-        VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-        return size;
-}
-/*
- * Calculate the maximum write size
- */
-int
-smb_get_wsize(struct smb_sb_info *server)
-{
-        /* writeX has 14 parameters, write has 5 */
-        int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
-        int size = smb_get_xmitsize(server, overhead);
-        VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-        return size;
-}
-/*
- * Convert SMB error codes to -E... errno values.
- */
-int
-smb_errno(struct smb_request *req)
-{
-        int errcls = req->rq_rcls;
-        int error  = req->rq_err;
-        char *class = "Unknown";
-        VERBOSE("errcls %d  code %d  from command 0x%x\n",
-                errcls, error, SMB_CMD(req->rq_header));
-        if (errcls == ERRDOS) {
-                switch (error) {
-                case ERRbadfunc:
-                        return -EINVAL;
-                case ERRbadfile:
-                case ERRbadpath:
-                        return -ENOENT;
-                case ERRnofids:
-                        return -EMFILE;
-                case ERRnoaccess:
-                        return -EACCES;
-                case ERRbadfid:
-                        return -EBADF;
-                case ERRbadmcb:
-                        return -EREMOTEIO;
-                case ERRnomem:
-                        return -ENOMEM;
-                case ERRbadmem:
-                        return -EFAULT;
-                case ERRbadenv:
-                case ERRbadformat:
-                        return -EREMOTEIO;
-                case ERRbadaccess:
-                        return -EACCES;
-                case ERRbaddata:
-                        return -E2BIG;
-                case ERRbaddrive:
-                        return -ENXIO;
-                case ERRremcd:
-                        return -EREMOTEIO;
-                case ERRdiffdevice:
-                        return -EXDEV;
-                case ERRnofiles:
-                        return -ENOENT;
-                case ERRbadshare:
-                        return -ETXTBSY;
-                case ERRlock:
-                        return -EDEADLK;
-                case ERRfilexists:
-                        return -EEXIST;
-                case ERROR_INVALID_PARAMETER:
-                        return -EINVAL;
-                case ERROR_DISK_FULL:
-                        return -ENOSPC;
-                case ERROR_INVALID_NAME:
-                        return -ENOENT;
-                case ERROR_DIR_NOT_EMPTY:
-                        return -ENOTEMPTY;
-                case ERROR_NOT_LOCKED:
-                       return -ENOLCK;
-                case ERROR_ALREADY_EXISTS:
-                        return -EEXIST;
-                default:
-                        class = "ERRDOS";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRSRV) {
-                switch (error) {
-                /* N.B. This is wrong ... EIO ? */
-                case ERRerror:
-                        return -ENFILE;
-                case ERRbadpw:
-                        return -EINVAL;
-                case ERRbadtype:
-                case ERRtimeout:
-                        return -EIO;
-                case ERRaccess:
-                        return -EACCES;
-                /*
-                 * This is a fatal error, as it means the "tree ID"
-                 * for this connection is no longer valid. We map
-                 * to a special error code and get a new connection.
-                 */
-                case ERRinvnid:
-                        return -EBADSLT;
-                default:
-                        class = "ERRSRV";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRHRD) {
-                switch (error) {
-                case ERRnowrite:
-                        return -EROFS;
-                case ERRbadunit:
-                        return -ENODEV;
-                case ERRnotready:
-                        return -EUCLEAN;
-                case ERRbadcmd:
-                case ERRdata:
-                        return -EIO;
-                case ERRbadreq:
-                        return -ERANGE;
-                case ERRbadshare:
-                        return -ETXTBSY;
-                case ERRlock:
-                        return -EDEADLK;
-                case ERRdiskfull:
-                        return -ENOSPC;
-                default:
-                        class = "ERRHRD";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRCMD) {
-                class = "ERRCMD";
-        } else if (errcls == SUCCESS) {
-                return 0;       /* This is the only valid 0 return */
-        }
-err_unknown:
-        printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
-               class, error, SMB_CMD(req->rq_header));
-        return -EIO;
-}
-/* smb_request_ok: We expect the server to be locked. Then we do the
-   request and check the answer completely. When smb_request_ok
-   returns 0, you can be quite sure that everything went well. When
-   the answer is <=0, the returned number is a valid unix errno. */
-static int
-smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
-{
-        int result;
-        req->rq_resp_wct = wct;
-        req->rq_resp_bcc = bcc;
-        result = smb_add_request(req);
-        if (result != 0) {
-                DEBUG1("smb_request failed\n");
-                goto out;
-        }
-        if (smb_valid_packet(req->rq_header) != 0) {
-                PARANOIA("invalid packet!\n");
-                goto out;
-        }
-        result = smb_verify(req->rq_header, command, wct, bcc);
-out:
-        return result;
-}
-/*
- * This implements the NEWCONN ioctl. It installs the server pid,
- * sets server->state to CONN_VALID, and wakes up the waiting process.
- */
-int
-smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
-{
-        struct file *filp;
-        struct sock *sk;
-        int error;
-        VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
-        smb_lock_server(server);
-        /*
-         * Make sure we don't already have a valid connection ...
-         */
-        error = -EINVAL;
-        if (server->state == CONN_VALID)
-                goto out;
-        error = -EACCES;
-        if (current_uid() != server->mnt->mounted_uid &&
-            !capable(CAP_SYS_ADMIN))
-                goto out;
-        error = -EBADF;
-        filp = fget(opt->fd);
-        if (!filp)
-                goto out;
-        if (!smb_valid_socket(filp->f_path.dentry->d_inode))
-                goto out_putf;
-        server->sock_file = filp;
-        server->conn_pid = get_pid(task_pid(current));
-        server->opt = *opt;
-        server->generation += 1;
-        server->state = CONN_VALID;
-        error = 0;
-        if (server->conn_error) {
-                /*
-                 * conn_error is the returncode we originally decided to
-                 * drop the old connection on. This message should be positive
-                 * and not make people ask questions on why smbfs is printing
-                 * error messages ...
-                 */
-                printk(KERN_INFO "SMB connection re-established (%d)\n",
-                       server->conn_error);
-                server->conn_error = 0;
-        }
-        /*
-         * Store the server in sock user_data (Only used by sunrpc)
-         */
-        sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
-        sk->sk_user_data = server;
-        /* chain into the data_ready callback */
-        server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
-        /* check if we have an old smbmount that uses seconds for the 
-           serverzone */
-        if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
-                server->opt.serverzone /= 60;
-        /* now that we have an established connection we can detect the server
-           type and enable bug workarounds */
-        if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
-                install_ops(server->ops, &smb_ops_core);
-        else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
-                install_ops(server->ops, &smb_ops_os2);
-        else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
-                 (server->opt.max_xmit < 0x1000) &&
-                 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
-                /* FIXME: can we kill the WIN95 flag now? */
-                server->mnt->flags |= SMB_MOUNT_WIN95;
-                VERBOSE("detected WIN95 server\n");
-                install_ops(server->ops, &smb_ops_win95);
-        } else {
-                /*
-                 * Samba has max_xmit 65535
-                 * NT4spX has max_xmit 4536 (or something like that)
-                 * win2k has ...
-                 */
-                VERBOSE("detected NT1 (Samba, NT4/5) server\n");
-                install_ops(server->ops, &smb_ops_winNT);
-        }
-        /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
-        if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
-                server->ops->getattr = smb_proc_getattr_core;
-        } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
-                server->ops->getattr = smb_proc_getattr_ff;
-        }
-        /* Decode server capabilities */
-        if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
-                /* Should be ok to set this now, as no one can access the
-                   mount until the connection has been established. */
-                SB_of(server)->s_maxbytes = ~0ULL >> 1;
-                VERBOSE("LFS enabled\n");
-        }
-        if (server->opt.capabilities & SMB_CAP_UNICODE) {
-                server->mnt->flags |= SMB_MOUNT_UNICODE;
-                VERBOSE("Unicode enabled\n");
-        } else {
-                server->mnt->flags &= ~SMB_MOUNT_UNICODE;
-        }
-#if 0
-        /* flags we may test for other patches ... */
-        if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
-                VERBOSE("Large reads enabled\n");
-        }
-        if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
-                VERBOSE("Large writes enabled\n");
-        }
-#endif
-        if (server->opt.capabilities & SMB_CAP_UNIX) {
-                struct inode *inode;
-                VERBOSE("Using UNIX CIFS extensions\n");
-                install_ops(server->ops, &smb_ops_unix);
-                inode = SB_of(server)->s_root->d_inode;
-                if (inode)
-                        inode->i_op = &smb_dir_inode_operations_unix;
-        }
-        VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
-                server->opt.protocol, server->opt.max_xmit,
-                pid_nr(server->conn_pid), server->opt.capabilities);
-        /* FIXME: this really should be done by smbmount. */
-        if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
-                server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
-        }
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        if (server->opt.capabilities & SMB_CAP_UNIX)
-                smb_proc_query_cifsunix(server);
-        server->conn_complete++;
-        wake_up_interruptible_all(&server->conn_wq);
-        return error;
-out:
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        return error;
-out_putf:
-        fput(filp);
-        goto out;
-}
-/* smb_setup_header: We completely set up the packet. You only have to
-   insert the command-specific fields */
-__u8 *
-smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
-{
-        __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
-        __u8 *p = req->rq_header;
-        struct smb_sb_info *server = req->rq_server;
-        p = smb_encode_smb_length(p, xmit_len - 4);
-        *p++ = 0xff;
-        *p++ = 'S';
-        *p++ = 'M';
-        *p++ = 'B';
-        *p++ = command;
-        memset(p, '\0', 19);
-        p += 19;
-        p += 8;
-        if (server->opt.protocol > SMB_PROTOCOL_CORE) {
-                int flags = SMB_FLAGS_CASELESS_PATHNAMES;
-                int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
-                        SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
-                *(req->rq_header + smb_flg) = flags;
-                if (server->mnt->flags & SMB_MOUNT_UNICODE)
-                        flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
-                WSET(req->rq_header, smb_flg2, flags2);
-        }
-        *p++ = wct;             /* wct */
-        p += 2 * wct;
-        WSET(p, 0, bcc);
-        /* Include the header in the data to send */
-        req->rq_iovlen = 1;
-        req->rq_iov[0].iov_base = req->rq_header;
-        req->rq_iov[0].iov_len  = xmit_len - bcc;
-        return req->rq_buffer;
-}
-static void
-smb_setup_bcc(struct smb_request *req, __u8 *p)
-{
-        u16 bcc = p - req->rq_buffer;
-        u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
-        WSET(pbcc, 0, bcc);
-        smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + 
-                              2*SMB_WCT(req->rq_header) - 2 + bcc);
-        /* Include the "bytes" in the data to send */
-        req->rq_iovlen = 2;
-        req->rq_iov[1].iov_base = req->rq_buffer;
-        req->rq_iov[1].iov_len  = bcc;
-}
-static int
-smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
-              __u16 mode, off_t offset)
-{
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBlseek, 4, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        WSET(req->rq_header, smb_vwv1, mode);
-        DSET(req->rq_header, smb_vwv2, offset);
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBlseek, 2, 0);
-        if (result < 0) {
-                result = 0;
-                goto out_free;
-        }
-        result = DVAL(req->rq_header, smb_vwv0);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
-{
-        struct inode *ino = dentry->d_inode;
-        struct smb_inode_info *ei = SMB_I(ino);
-        int mode, read_write = 0x42, read_only = 0x40;
-        int res;
-        char *p;
-        struct smb_request *req;
-        /*
-         * Attempt to open r/w, unless there are no write privileges.
-         */
-        mode = read_write;
-        if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
-                mode = read_only;
-#if 0
-        /* FIXME: why is this code not in? below we fix it so that a caller
-           wanting RO doesn't get RW. smb_revalidate_inode does some 
-           optimization based on access mode. tail -f needs it to be correct.
-           We must open rw since we don't do the open if called a second time
-           with different 'wish'. Is that not supported by smb servers? */
-        if (!(wish & (O_WRONLY | O_RDWR)))
-                mode = read_only;
-#endif
-        res = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-      retry:
-        p = smb_setup_header(req, SMBopen, 2, 0);
-        WSET(req->rq_header, smb_vwv0, mode);
-        WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
-        res = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (res < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        res = smb_request_ok(req, SMBopen, 7, 0);
-        if (res != 0) {
-                if (mode == read_write &&
-                    (res == -EACCES || res == -ETXTBSY || res == -EROFS))
-                {
-                        VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
-                                DENTRY_PATH(dentry), res);
-                        mode = read_only;
-                        req->rq_flags = 0;
-                        goto retry;
-                }
-                goto out_free;
-        }
-        /* We should now have data in vwv[0..6]. */
-        ei->fileid = WVAL(req->rq_header, smb_vwv0);
-        ei->attr   = WVAL(req->rq_header, smb_vwv1);
-        /* smb_vwv2 has mtime */
-        /* smb_vwv4 has size  */
-        ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
-        ei->open = server->generation;
-out_free:
-        smb_rput(req);
-out:
-        return res;
-}
-/*
- * Make sure the file is open, and check that the access
- * is compatible with the desired access.
- */
-int
-smb_open(struct dentry *dentry, int wish)
-{
-        struct inode *inode = dentry->d_inode;
-        int result;
-        __u16 access;
-        result = -ENOENT;
-        if (!inode) {
-                printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
-                       DENTRY_PATH(dentry));
-                goto out;
-        }
-        if (!smb_is_open(inode)) {
-                struct smb_sb_info *server = server_from_inode(inode);
-                result = 0;
-                if (!smb_is_open(inode))
-                        result = smb_proc_open(server, dentry, wish);
-                if (result)
-                        goto out;
-                /*
-                 * A successful open means the path is still valid ...
-                 */
-                smb_renew_times(dentry);
-        }
-        /*
-         * Check whether the access is compatible with the desired mode.
-         */
-        result = 0;
-        access = SMB_I(inode)->access;
-        if (access != wish && access != SMB_O_RDWR) {
-                PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
-                         DENTRY_PATH(dentry), access, wish);
-                result = -EACCES;
-        }
-out:
-        return result;
-}
-static int 
-smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
-{
-        struct smb_request *req;
-        int result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBclose, 3, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBclose, 0, 0);
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Win NT 4.0 has an apparent bug in that it fails to update the
- * modify time when writing to a file. As a workaround, we update
- * both modify and access time locally, and post the times to the
- * server when closing the file.
- */
-static int 
-smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
-{
-        struct smb_inode_info *ei = SMB_I(ino);
-        int result = 0;
-        if (smb_is_open(ino))
-        {
-                /*
-                 * We clear the open flag in advance, in case another
-                 * process observes the value while we block below.
-                 */
-                ei->open = 0;
-                /*
-                 * Kludge alert: SMB timestamps are accurate only to
-                 * two seconds ... round the times to avoid needless
-                 * cache invalidations!
-                 */
-                if (ino->i_mtime.tv_sec & 1) { 
-                        ino->i_mtime.tv_sec--;
-                        ino->i_mtime.tv_nsec = 0; 
-                }
-                if (ino->i_atime.tv_sec & 1) {
-                        ino->i_atime.tv_sec--;
-                        ino->i_atime.tv_nsec = 0;
-                }
-                /*
-                 * If the file is open with write permissions,
-                 * update the time stamps to sync mtime and atime.
-                 */
-                if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
-                    (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
-                    !(ei->access == SMB_O_RDONLY))
-                {
-                        struct smb_fattr fattr;
-                        smb_get_inode_attr(ino, &fattr);
-                        smb_proc_setattr_ext(server, ino, &fattr);
-                }
-                result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
-                /*
-                 * Force a revalidation after closing ... some servers
-                 * don't post the size until the file has been closed.
-                 */
-                if (server->opt.protocol < SMB_PROTOCOL_NT1)
-                        ei->oldmtime = 0;
-                ei->closed = jiffies;
-        }
-        return result;
-}
-int
-smb_close(struct inode *ino)
-{
-        int result = 0;
-        if (smb_is_open(ino)) {
-                struct smb_sb_info *server = server_from_inode(ino);
-                result = smb_proc_close_inode(server, ino);
-        }
-        return result;
-}
-/*
- * This is used to close a file following a failed instantiate.
- * Since we don't have an inode, we can't use any of the above.
- */
-int
-smb_close_fileid(struct dentry *dentry, __u16 fileid)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int result;
-        result = smb_proc_close(server, fileid, get_seconds());
-        return result;
-}
-/* In smb_proc_read and smb_proc_write we do not retry, because the
-   file-id would not be valid after a reconnection. */
-static void
-smb_proc_read_data(struct smb_request *req)
-{
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = 3;
-        req->rq_iov[1].iov_base = req->rq_page;
-        req->rq_iov[1].iov_len  = req->rq_rsize;
-        req->rq_iovlen = 2;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-static int
-smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        __u16 returned_count, data_len;
-        unsigned char *buf;
-        int result;
-        struct smb_request *req;
-        u8 rbuf[4];
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBread, 5, 0);
-        buf = req->rq_header;
-        WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
-        WSET(buf, smb_vwv1, count);
-        DSET(buf, smb_vwv2, offset);
-        WSET(buf, smb_vwv4, 0);
-        req->rq_page = data;
-        req->rq_rsize = count;
-        req->rq_callback = smb_proc_read_data;
-        req->rq_buffer = rbuf;
-        req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
-        result = smb_request_ok(req, SMBread, 5, -1);
-        if (result < 0)
-                goto out_free;
-        returned_count = WVAL(req->rq_header, smb_vwv0);
-        data_len = WVAL(rbuf, 1);
-        if (returned_count != data_len) {
-                printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
-                printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
-                       returned_count, data_len);
-        }
-        result = data_len;
-out_free:
-        smb_rput(req);
-out:
-        VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, result);
-        return result;
-}
-static int
-smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        u16 fileid = SMB_I(inode)->fileid;
-        u8 buf[4];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-                inode->i_ino, fileid, count, offset);
-        smb_setup_header(req, SMBwrite, 5, count + 3);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        WSET(req->rq_header, smb_vwv1, count);
-        DSET(req->rq_header, smb_vwv2, offset);
-        WSET(req->rq_header, smb_vwv4, 0);
-        buf[0] = 1;
-        WSET(buf, 1, count);    /* yes, again ... */
-        req->rq_iov[1].iov_base = buf;
-        req->rq_iov[1].iov_len = 3;
-        req->rq_iov[2].iov_base = (char *) data;
-        req->rq_iov[2].iov_len = count;
-        req->rq_iovlen = 3;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBwrite, 1, 0);
-        if (result >= 0)
-                result = WVAL(req->rq_header, smb_vwv0);
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * In smb_proc_readX and smb_proc_writeX we do not retry, because the
- * file-id would not be valid after a reconnection.
- */
-#define SMB_READX_MAX_PAD      64
-static void
-smb_proc_readX_data(struct smb_request *req)
-{
-        /* header length, excluding the netbios length (-4) */
-        int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-        int data_off = WVAL(req->rq_header, smb_vwv6);
-        /*
-         * Some genius made the padding to the data bytes arbitrary.
-         * So we must first calculate the amount of padding used by the server.
-         */
-        data_off -= hdrlen;
-        if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
-                PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
-                PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
-                req->rq_rlen = req->rq_bufsize + 1;
-                return;
-        }
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = data_off;
-        req->rq_iov[1].iov_base = req->rq_page;
-        req->rq_iov[1].iov_len  = req->rq_rsize;
-        req->rq_iovlen = 2;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-static int
-smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        unsigned char *buf;
-        int result;
-        struct smb_request *req;
-        static char pad[SMB_READX_MAX_PAD];
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBreadX, 12, 0);
-        buf = req->rq_header;
-        WSET(buf, smb_vwv0, 0x00ff);
-        WSET(buf, smb_vwv1, 0);
-        WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
-        DSET(buf, smb_vwv3, (u32)offset);               /* low 32 bits */
-        WSET(buf, smb_vwv5, count);
-        WSET(buf, smb_vwv6, 0);
-        DSET(buf, smb_vwv7, 0);
-        WSET(buf, smb_vwv9, 0);
-        DSET(buf, smb_vwv10, (u32)(offset >> 32));      /* high 32 bits */
-        WSET(buf, smb_vwv11, 0);
-        req->rq_page = data;
-        req->rq_rsize = count;
-        req->rq_callback = smb_proc_readX_data;
-        req->rq_buffer = pad;
-        req->rq_bufsize = SMB_READX_MAX_PAD;
-        req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBreadX, 12, -1);
-        if (result < 0)
-                goto out_free;
-        result = WVAL(req->rq_header, smb_vwv5);
-out_free:
-        smb_rput(req);
-out:
-        VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, result);
-        return result;
-}
-static int
-smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        u8 *p;
-        static u8 pad[4];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, offset);
-        p = smb_setup_header(req, SMBwriteX, 14, count + 1);
-        WSET(req->rq_header, smb_vwv0, 0x00ff);
-        WSET(req->rq_header, smb_vwv1, 0);
-        WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
-        DSET(req->rq_header, smb_vwv3, (u32)offset);    /* low 32 bits */
-        DSET(req->rq_header, smb_vwv5, 0);
-        WSET(req->rq_header, smb_vwv7, 0);              /* write mode */
-        WSET(req->rq_header, smb_vwv8, 0);
-        WSET(req->rq_header, smb_vwv9, 0);
-        WSET(req->rq_header, smb_vwv10, count);         /* data length */
-        WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
-        DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
-        req->rq_iov[1].iov_base = pad;
-        req->rq_iov[1].iov_len = 1;
-        req->rq_iov[2].iov_base = (char *) data;
-        req->rq_iov[2].iov_len = count;
-        req->rq_iovlen = 3;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBwriteX, 6, 0);
-        if (result >= 0)
-                result = WVAL(req->rq_header, smb_vwv2);
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBcreate, 3, 0);
-        WSET(req->rq_header, smb_vwv0, attr);
-        DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, SMBcreate, 1, 0);
-        if (result < 0)
-                goto out_free;
-        *fileid = WVAL(req->rq_header, smb_vwv0);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(old_dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBmv, 1, 0);
-        WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
-        result = smb_simple_encode_path(req, &p, old_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        result = smb_simple_encode_path(req, &p, new_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Code common to mkdir and rmdir.
- */
-static int
-smb_proc_generic_command(struct dentry *dentry, __u8 command)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, command, 0, 0);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, command, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_mkdir(struct dentry *dentry)
-{
-        return smb_proc_generic_command(dentry, SMBmkdir);
-}
-int
-smb_proc_rmdir(struct dentry *dentry)
-{
-        return smb_proc_generic_command(dentry, SMBrmdir);
-}
-#if SMBFS_POSIX_UNLINK
-/*
- * Removes readonly attribute from a file. Used by unlink to give posix
- * semantics.
- */
-static int
-smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
-{
-        int result;
-        struct smb_fattr fattr;
-        /* FIXME: cifsUE should allow removing a readonly file. */
-        /* first get current attribute */
-        smb_init_dirent(server, &fattr);
-        result = server->ops->getattr(server, dentry, &fattr);
-        smb_finish_dirent(server, &fattr);
-        if (result < 0)
-                return result;
-        /* if RONLY attribute is set, remove it */
-        if (fattr.attr & aRONLY) {  /* read only attribute is set */
-                fattr.attr &= ~aRONLY;
-                result = smb_proc_setattr_core(server, dentry, fattr.attr);
-        }
-        return result;
-}
-#endif
-int
-smb_proc_unlink(struct dentry *dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int flag = 0;
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-      retry:
-        p = smb_setup_header(req, SMBunlink, 1, 0);
-        WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
-#if SMBFS_POSIX_UNLINK
-                if (result == -EACCES && !flag) {
-                        /* Posix semantics is for the read-only state
-                           of a file to be ignored in unlink(). In the
-                           SMB world a unlink() is refused on a
-                           read-only file. To make things easier for
-                           unix users we try to override the files
-                           permission if the unlink fails with the
-                           right error.
-                           This introduces a race condition that could
-                           lead to a file being written by someone who
-                           shouldn't have access, but as far as I can
-                           tell that is unavoidable */
-                        /* remove RONLY attribute and try again */
-                        result = smb_set_rw(dentry,server);
-                        if (result == 0) {
-                                flag = 1;
-                                req->rq_flags = 0;
-                                goto retry;
-                        }
-                }
-#endif
-                goto out_free;
-        }
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
-{
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBflush, 1, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBflush, 0, 0);
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_trunc32(struct inode *inode, loff_t length)
-{
-        /*
-         * Writing 0bytes is old-SMB magic for truncating files.
-         * MAX_NON_LFS should prevent this from being called with a too
-         * large offset.
-         */
-        return smb_proc_write(inode, length, 0, NULL);
-}
-static int
-smb_proc_trunc64(struct inode *inode, loff_t length)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        char *param;
-        char *data;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 14)))
-                goto out;
-        param = req->rq_buffer;
-        data = req->rq_buffer + 6;
-        /* FIXME: must we also set allocation size? winNT seems to do that */
-        WSET(param, 0, SMB_I(inode)->fileid);
-        WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
-        WSET(param, 4, 0);
-        LSET(data, 0, length);
-        req->rq_trans2_command = TRANSACT2_SETFILEINFO;
-        req->rq_ldata = 8;
-        req->rq_data  = data;
-        req->rq_lparm = 6;
-        req->rq_parm  = param;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-        if (req->rq_rcls != 0)
-                result = smb_errno(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_trunc95(struct inode *inode, loff_t length)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result = smb_proc_trunc32(inode, length);
- 
-        /*
-         * win9x doesn't appear to update the size immediately.
-         * It will return the old file size after the truncate,
-         * confusing smbfs. So we force an update.
-         *
-         * FIXME: is this still necessary?
-         */
-        smb_proc_flush(server, SMB_I(inode)->fileid);
-        return result;
-}
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-        memset(fattr, 0, sizeof(*fattr));
-        fattr->f_nlink = 1;
-        fattr->f_uid = server->mnt->uid;
-        fattr->f_gid = server->mnt->gid;
-        fattr->f_unix = 0;
-}
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-        if (fattr->f_unix)
-                return;
-        fattr->f_mode = server->mnt->file_mode;
-        if (fattr->attr & aDIR) {
-                fattr->f_mode = server->mnt->dir_mode;
-                fattr->f_size = SMB_ST_BLKSIZE;
-        }
-        /* Check the read-only flag */
-        if (fattr->attr & aRONLY)
-                fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
-        /* How many 512 byte blocks do we need for this file? */
-        fattr->f_blocks = 0;
-        if (fattr->f_size != 0)
-                fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
-        return;
-}
-void
-smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-                     struct super_block *sb)
-{
-        smb_init_dirent(server, fattr);
-        fattr->attr = aDIR;
-        fattr->f_ino = 2; /* traditional root inode number */
-        fattr->f_mtime = current_fs_time(sb);
-        smb_finish_dirent(server, fattr);
-}
-/*
- * Decode a dirent for old protocols
- *
- * qname is filled with the decoded, and possibly translated, name.
- * fattr receives decoded attributes
- *
- * Bugs Noted:
- * (1) Pathworks servers may pad the name with extra spaces.
- */
-static char *
-smb_decode_short_dirent(struct smb_sb_info *server, char *p,
-                        struct qstr *qname, struct smb_fattr *fattr,
-                        unsigned char *name_buf)
-{
-        int len;
-        /*
-         * SMB doesn't have a concept of inode numbers ...
-         */
-        smb_init_dirent(server, fattr);
-        fattr->f_ino = 0;       /* FIXME: do we need this? */
-        p += SMB_STATUS_SIZE;   /* reserved (search_status) */
-        fattr->attr = *p;
-        fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
-        fattr->f_mtime.tv_nsec = 0;
-        fattr->f_size = DVAL(p, 5);
-        fattr->f_ctime = fattr->f_mtime;
-        fattr->f_atime = fattr->f_mtime;
-        qname->name = p + 9;
-        len = strnlen(qname->name, 12);
-        /*
-         * Trim trailing blanks for Pathworks servers
-         */
-        while (len > 2 && qname->name[len-1] == ' ')
-                len--;
-        smb_finish_dirent(server, fattr);
-#if 0
-        /* FIXME: These only work for ascii chars, and recent smbmount doesn't
-           allow the flag to be set anyway. It kills const. Remove? */
-        switch (server->opt.case_handling) {
-        case SMB_CASE_UPPER:
-                str_upper(entry->name, len);
-                break;
-        case SMB_CASE_LOWER:
-                str_lower(entry->name, len);
-                break;
-        default:
-                break;
-        }
-#endif
-        qname->len = 0;
-        len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-                                   qname->name, len,
-                                   server->remote_nls, server->local_nls);
-        if (len > 0) {
-                qname->len = len;
-                qname->name = name_buf;
-                DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
-        }
-        return p + 22;
-}
-/*
- * This routine is used to read in directory entries from the network.
- * Note that it is for short directory name seeks, i.e.: protocol <
- * SMB_PROTOCOL_LANMAN2
- */
-static int
-smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
-                       struct smb_cache_control *ctl)
-{
-        struct dentry *dir = filp->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dir);
-        struct qstr qname;
-        struct smb_fattr fattr;
-        char *p;
-        int result;
-        int i, first, entries_seen, entries;
-        int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
-        __u16 bcc;
-        __u16 count;
-        char status[SMB_STATUS_SIZE];
-        static struct qstr mask = {
-                .name   = "*.*",
-                .len    = 3,
-        };
-        unsigned char *last_status;
-        struct smb_request *req;
-        unsigned char *name_buf;
-        VERBOSE("%s/%s\n", DENTRY_PATH(dir));
-        lock_kernel();
-        result = -ENOMEM;
-        if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
-                goto out;
-        first = 1;
-        entries = 0;
-        entries_seen = 2; /* implicit . and .. */
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-                goto out_name;
-        while (1) {
-                p = smb_setup_header(req, SMBsearch, 2, 0);
-                WSET(req->rq_header, smb_vwv0, entries_asked);
-                WSET(req->rq_header, smb_vwv1, aDIR);
-                if (first == 1) {
-                        result = smb_simple_encode_path(req, &p, dir, &mask);
-                        if (result < 0)
-                                goto out_free;
-                        if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
-                                result = -ENAMETOOLONG;
-                                goto out_free;
-                        }
-                        *p++ = 5;
-                        WSET(p, 0, 0);
-                        p += 2;
-                        first = 0;
-                } else {
-                        if (p + 5 + SMB_STATUS_SIZE >
-                            (char *)req->rq_buffer + req->rq_bufsize) {
-                                result = -ENAMETOOLONG;
-                                goto out_free;
-                        }
-                                
-                        *p++ = 4;
-                        *p++ = 0;
-                        *p++ = 5;
-                        WSET(p, 0, SMB_STATUS_SIZE);
-                        p += 2;
-                        memcpy(p, status, SMB_STATUS_SIZE);
-                        p += SMB_STATUS_SIZE;
-                }
-                smb_setup_bcc(req, p);
-                result = smb_request_ok(req, SMBsearch, 1, -1);
-                if (result < 0) {
-                        if ((req->rq_rcls == ERRDOS) && 
-                            (req->rq_err  == ERRnofiles))
-                                break;
-                        goto out_free;
-                }
-                count = WVAL(req->rq_header, smb_vwv0);
-                if (count <= 0)
-                        break;
-                result = -EIO;
-                bcc = smb_bcc(req->rq_header);
-                if (bcc != count * SMB_DIRINFO_SIZE + 3)
-                        goto out_free;
-                p = req->rq_buffer + 3;
-                /* Make sure the response fits in the buffer. Fixed sized 
-                   entries means we don't have to check in the decode loop. */
-                last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
-                if (last_status + SMB_DIRINFO_SIZE >=
-                    req->rq_buffer + req->rq_bufsize) {
-                        printk(KERN_ERR "smb_proc_readdir_short: "
-                               "last dir entry outside buffer! "
-                               "%d@%p  %d@%p\n", SMB_DIRINFO_SIZE, last_status,
-                               req->rq_bufsize, req->rq_buffer);
-                        goto out_free;
-                }
-                /* Read the last entry into the status field. */
-                memcpy(status, last_status, SMB_STATUS_SIZE);
-                /* Now we are ready to parse smb directory entries. */
-                for (i = 0; i < count; i++) {
-                        p = smb_decode_short_dirent(server, p, 
-                                                    &qname, &fattr, name_buf);
-                        if (qname.len == 0)
-                                continue;
-                        if (entries_seen == 2 && qname.name[0] == '.') {
-                                if (qname.len == 1)
-                                        continue;
-                                if (qname.name[1] == '.' && qname.len == 2)
-                                        continue;
-                        }
-                        if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-                                            &qname, &fattr))
-                                ;       /* stop reading? */
-                        entries_seen++;
-                }
-        }
-        result = entries;
-out_free:
-        smb_rput(req);
-out_name:
-        kfree(name_buf);
-out:
-        unlock_kernel();
-        return result;
-}
-static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
-{
-        u64 size, disk_bytes;
-        /* FIXME: verify nls support. all is sent as utf8? */
-        fattr->f_unix = 1;
-        fattr->f_mode = 0;
-        /* FIXME: use the uniqueID from the remote instead? */
-        /* 0 L file size in bytes */
-        /* 8 L file size on disk in bytes (block count) */
-        /* 40 L uid */
-        /* 48 L gid */
-        /* 56 W file type */
-        /* 60 L devmajor */
-        /* 68 L devminor */
-        /* 76 L unique ID (inode) */
-        /* 84 L permissions */
-        /* 92 L link count */
-        size = LVAL(p, 0);
-        disk_bytes = LVAL(p, 8);
-        /*
-         * Some samba versions round up on-disk byte usage
-         * to 1MB boundaries, making it useless. When seeing
-         * that, use the size instead.
-         */
-        if (!(disk_bytes & 0xfffff))
-                disk_bytes = size+511;
-        fattr->f_size = size;
-        fattr->f_blocks = disk_bytes >> 9;
-        fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
-        fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
-        fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
-        if (server->mnt->flags & SMB_MOUNT_UID)
-                fattr->f_uid = server->mnt->uid;
-        else
-                fattr->f_uid = LVAL(p, 40);
-        if (server->mnt->flags & SMB_MOUNT_GID)
-                fattr->f_gid = server->mnt->gid;
-        else
-                fattr->f_gid = LVAL(p, 48);
-        fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
-        if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
-                __u64 major = LVAL(p, 60);
-                __u64 minor = LVAL(p, 68);
-                fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
-                if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
-                MINOR(fattr->f_rdev) != (minor & 0xffffffff))
-                        fattr->f_rdev = 0;
-        }
-        fattr->f_mode |= LVAL(p, 84);
-        if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
-             (S_ISDIR(fattr->f_mode)) )
-                fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-        else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
-                  !(S_ISDIR(fattr->f_mode)) )
-                fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
-                                (fattr->f_mode & S_IFMT);
-}
-/*
- * Interpret a long filename structure using the specified info level:
- *   level 1 for anything below NT1 protocol
- *   level 260 for NT1 protocol
- *
- * qname is filled with the decoded, and possibly translated, name
- * fattr receives decoded attributes.
- *
- * Bugs Noted:
- * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
- */
-static char *
-smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
-                       struct qstr *qname, struct smb_fattr *fattr,
-                       unsigned char *name_buf)
-{
-        char *result;
-        unsigned int len = 0;
-        int n;
-        __u16 date, time;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-        /*
-         * SMB doesn't have a concept of inode numbers ...
-         */
-        smb_init_dirent(server, fattr);
-        fattr->f_ino = 0;       /* FIXME: do we need this? */
-        switch (level) {
-        case 1:
-                len = *((unsigned char *) p + 22);
-                qname->name = p + 23;
-                result = p + 24 + len;
-                date = WVAL(p, 0);
-                time = WVAL(p, 2);
-                fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_ctime.tv_nsec = 0;
-                date = WVAL(p, 4);
-                time = WVAL(p, 6);
-                fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_atime.tv_nsec = 0;
-                date = WVAL(p, 8);
-                time = WVAL(p, 10);
-                fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_mtime.tv_nsec = 0;
-                fattr->f_size = DVAL(p, 12);
-                /* ULONG allocation size */
-                fattr->attr = WVAL(p, 20);
-                VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        case 260:
-                result = p + WVAL(p, 0);
-                len = DVAL(p, 60);
-                if (len > 255) len = 255;
-                /* NT4 null terminates, unless we are using unicode ... */
-                qname->name = p + 94;
-                if (!unicode && len && qname->name[len-1] == '\0')
-                        len--;
-                fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
-                fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
-                fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
-                /* change time (32) */
-                fattr->f_size = LVAL(p, 40);
-                /* alloc size (48) */
-                fattr->attr = DVAL(p, 56);
-                VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        case SMB_FIND_FILE_UNIX:
-                result = p + WVAL(p, 0);
-                qname->name = p + 108;
-                len = strlen(qname->name);
-                /* FIXME: should we check the length?? */
-                p += 8;
-                smb_decode_unix_basic(fattr, server, p);
-                VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        default:
-                PARANOIA("Unknown info level %d\n", level);
-                result = p + WVAL(p, 0);
-                goto out;
-        }
-        smb_finish_dirent(server, fattr);
-#if 0
-        /* FIXME: These only work for ascii chars, and recent smbmount doesn't
-           allow the flag to be set anyway. Remove? */
-        switch (server->opt.case_handling) {
-        case SMB_CASE_UPPER:
-                str_upper(qname->name, len);
-                break;
-        case SMB_CASE_LOWER:
-                str_lower(qname->name, len);
-                break;
-        default:
-                break;
-        }
-#endif
-        qname->len = 0;
-        n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-                                 qname->name, len,
-                                 server->remote_nls, server->local_nls);
-        if (n > 0) {
-                qname->len = n;
-                qname->name = name_buf;
-        }
-out:
-        return result;
-}
-/* findfirst/findnext flags */
-#define SMB_CLOSE_AFTER_FIRST (1<<0)
-#define SMB_CLOSE_IF_END (1<<1)
-#define SMB_REQUIRE_RESUME_KEY (1<<2)
-#define SMB_CONTINUE_BIT (1<<3)
-/*
- * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
- * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
- * go there for advise.
- *
- * Bugs Noted:
- * (1) When using Info Level 1 Win NT 4.0 truncates directory listings 
- * for certain patterns of names and/or lengths. The breakage pattern
- * is completely reproducible and can be toggled by the creation of a
- * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
- */
-static int
-smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
-                      struct smb_cache_control *ctl)
-{
-        struct dentry *dir = filp->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dir);
-        struct qstr qname;
-        struct smb_fattr fattr;
-        unsigned char *p, *lastname;
-        char *mask, *param;
-        __u16 command;
-        int first, entries_seen;
-        /* Both NT and OS/2 accept info level 1 (but see note below). */
-        int info_level = 260;
-        const int max_matches = 512;
-        unsigned int ff_searchcount = 0;
-        unsigned int ff_eos = 0;
-        unsigned int ff_lastname = 0;
-        unsigned int ff_dir_handle = 0;
-        unsigned int loop_count = 0;
-        unsigned int mask_len, i;
-        int result;
-        struct smb_request *req;
-        unsigned char *name_buf;
-        static struct qstr star = {
-                .name   = "*",
-                .len    = 1,
-        };
-        lock_kernel();
-        /*
-         * We always prefer unix style. Use info level 1 for older
-         * servers that don't do 260.
-         */
-        if (server->opt.capabilities & SMB_CAP_UNIX)
-                info_level = SMB_FIND_FILE_UNIX;
-        else if (server->opt.protocol < SMB_PROTOCOL_NT1)
-                info_level = 1;
-        result = -ENOMEM;
-        if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
-                goto out;
-        if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-                goto out_name;
-        param = req->rq_buffer;
-        /*
-         * Encode the initial path
-         */
-        mask = param + 12;
-        result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
-        if (result <= 0)
-                goto out_free;
-        mask_len = result - 1;  /* mask_len is strlen, not #bytes */
-        result = 0;
-        first = 1;
-        VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
-        entries_seen = 2;
-        ff_eos = 0;
-        while (ff_eos == 0) {
-                loop_count += 1;
-                if (loop_count > 10) {
-                        printk(KERN_WARNING "smb_proc_readdir_long: "
-                               "Looping in FIND_NEXT??\n");
-                        result = -EIO;
-                        break;
-                }
-                if (first != 0) {
-                        command = TRANSACT2_FINDFIRST;
-                        WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-                        WSET(param, 2, max_matches);    /* max count */
-                        WSET(param, 4, SMB_CLOSE_IF_END);
-                        WSET(param, 6, info_level);
-                        DSET(param, 8, 0);
-                } else {
-                        command = TRANSACT2_FINDNEXT;
-                        VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
-                                ff_dir_handle, ff_lastname, mask_len, mask);
-                        WSET(param, 0, ff_dir_handle);  /* search handle */
-                        WSET(param, 2, max_matches);    /* max count */
-                        WSET(param, 4, info_level);
-                        DSET(param, 6, 0);
-                        WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
-                }
-                req->rq_trans2_command = command;
-                req->rq_ldata = 0;
-                req->rq_data  = NULL;
-                req->rq_lparm = 12 + mask_len + 1;
-                req->rq_parm  = param;
-                req->rq_flags = 0;
-                result = smb_add_request(req);
-                if (result < 0) {
-                        PARANOIA("error=%d, breaking\n", result);
-                        break;
-                }
-                if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
-                        /* a damn Win95 bug - sometimes it clags if you 
-                           ask it too fast */
-                        schedule_timeout_interruptible(msecs_to_jiffies(200));
-                        continue;
-                }
-                if (req->rq_rcls != 0) {
-                        result = smb_errno(req);
-                        PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
-                                 mask, result, req->rq_rcls, req->rq_err);
-                        break;
-                }
-                /* parse out some important return info */
-                if (first != 0) {
-                        ff_dir_handle = WVAL(req->rq_parm, 0);
-                        ff_searchcount = WVAL(req->rq_parm, 2);
-                        ff_eos = WVAL(req->rq_parm, 4);
-                        ff_lastname = WVAL(req->rq_parm, 8);
-                } else {
-                        ff_searchcount = WVAL(req->rq_parm, 0);
-                        ff_eos = WVAL(req->rq_parm, 2);
-                        ff_lastname = WVAL(req->rq_parm, 6);
-                }
-                if (ff_searchcount == 0)
-                        break;
-                /* Now we are ready to parse smb directory entries. */
-                /* point to the data bytes */
-                p = req->rq_data;
-                for (i = 0; i < ff_searchcount; i++) {
-                        /* make sure we stay within the buffer */
-                        if (p >= req->rq_data + req->rq_ldata) {
-                                printk(KERN_ERR "smb_proc_readdir_long: "
-                                       "dirent pointer outside buffer! "
-                                       "%p  %d@%p\n",
-                                       p, req->rq_ldata, req->rq_data);
-                                result = -EIO; /* always a comm. error? */
-                                goto out_free;
-                        }
-                        p = smb_decode_long_dirent(server, p, info_level,
-                                                   &qname, &fattr, name_buf);
-                        /* ignore . and .. from the server */
-                        if (entries_seen == 2 && qname.name[0] == '.') {
-                                if (qname.len == 1)
-                                        continue;
-                                if (qname.name[1] == '.' && qname.len == 2)
-                                        continue;
-                        }
-                        if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-                                            &qname, &fattr))
-                                ;       /* stop reading? */
-                        entries_seen++;
-                }
-                VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
-                /*
-                 * We might need the lastname for continuations.
-                 *
-                 * Note that some servers (win95?) point to the filename and
-                 * others (NT4, Samba using NT1) to the dir entry. We assume
-                 * here that those who do not point to a filename do not need
-                 * this info to continue the listing.
-                 *
-                 * OS/2 needs this and talks infolevel 1.
-                 * NetApps want lastname with infolevel 260.
-                 * win2k want lastname with infolevel 260, and points to
-                 *       the record not to the name.
-                 * Samba+CifsUnixExt doesn't need lastname.
-                 *
-                 * Both are happy if we return the data they point to. So we do.
-                 * (FIXME: above is not true with win2k)
-                 */
-                mask_len = 0;
-                if (info_level != SMB_FIND_FILE_UNIX &&
-                    ff_lastname > 0 && ff_lastname < req->rq_ldata) {
-                        lastname = req->rq_data + ff_lastname;
-                        switch (info_level) {
-                        case 260:
-                                mask_len = req->rq_ldata - ff_lastname;
-                                break;
-                        case 1:
-                                /* lastname points to a length byte */
-                                mask_len = *lastname++;
-                                if (ff_lastname + 1 + mask_len > req->rq_ldata)
-                                        mask_len = req->rq_ldata - ff_lastname - 1;
-                                break;
-                        }
-                        /*
-                         * Update the mask string for the next message.
-                         */
-                        if (mask_len > 255)
-                                mask_len = 255;
-                        if (mask_len)
-                                strncpy(mask, lastname, mask_len);
-                }
-                mask_len = strnlen(mask, mask_len);
-                VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
-                        mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
-                first = 0;
-                loop_count = 0;
-        }
-out_free:
-        smb_rput(req);
-out_name:
-        kfree(name_buf);
-out:
-        unlock_kernel();
-        return result;
-}
-/*
- * This version uses the trans2 TRANSACT2_FINDFIRST message 
- * to get the attribute data.
- *
- * Bugs Noted:
- */
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-                        struct smb_fattr *fattr)
-{
-        char *param, *mask;
-        __u16 date, time;
-        int mask_len, result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        mask = param + 12;
-        mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
-        if (mask_len < 0) {
-                result = mask_len;
-                goto out_free;
-        }
-        VERBOSE("name=%s, len=%d\n", mask, mask_len);
-        WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-        WSET(param, 2, 1);      /* max count */
-        WSET(param, 4, 1);      /* close after this call */
-        WSET(param, 6, 1);      /* info_level */
-        DSET(param, 8, 0);
-        req->rq_trans2_command = TRANSACT2_FINDFIRST;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = 12 + mask_len;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        if (req->rq_rcls != 0) {
-                result = smb_errno(req);
-#ifdef SMBFS_PARANOIA
-                if (result != -ENOENT)
-                        PARANOIA("error for %s, rcls=%d, err=%d\n",
-                                 mask, req->rq_rcls, req->rq_err);
-#endif
-                goto out_free;
-        }
-        /* Make sure we got enough data ... */
-        result = -EINVAL;
-        if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
-                PARANOIA("bad result for %s, len=%d, count=%d\n",
-                         mask, req->rq_ldata, WVAL(req->rq_parm, 2));
-                goto out_free;
-        }
-        /*
-         * Decode the response into the fattr ...
-         */
-        date = WVAL(req->rq_data, 0);
-        time = WVAL(req->rq_data, 2);
-        fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_ctime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 4);
-        time = WVAL(req->rq_data, 6);
-        fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_atime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 8);
-        time = WVAL(req->rq_data, 10);
-        fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_mtime.tv_nsec = 0;
-        VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-                mask, date, time, fattr->f_mtime.tv_sec);
-        fattr->f_size = DVAL(req->rq_data, 12);
-        /* ULONG allocation size */
-        fattr->attr = WVAL(req->rq_data, 20);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *fattr)
-{
-        int result;
-        char *p;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBgetatr, 0, 0);
-        result = smb_simple_encode_path(req, &p, dir, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
-                goto out_free;
-        fattr->attr    = WVAL(req->rq_header, smb_vwv0);
-        fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
-        fattr->f_mtime.tv_nsec = 0;
-        fattr->f_size  = DVAL(req->rq_header, smb_vwv3);
-        fattr->f_ctime = fattr->f_mtime; 
-        fattr->f_atime = fattr->f_mtime; 
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk("getattr_core: %s/%s, mtime=%ld\n",
-               DENTRY_PATH(dir), fattr->f_mtime);
-#endif
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Bugs Noted:
- * (1) Win 95 swaps the date and time fields in the standard info level.
- */
-static int
-smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
-                        struct smb_request *req, int infolevel)
-{
-        char *p, *param;
-        int result;
-        param = req->rq_buffer;
-        WSET(param, 0, infolevel);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-        if (result < 0)
-                goto out;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_QPATHINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out;
-        if (req->rq_rcls != 0) {
-                VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
-                        &param[6], result, req->rq_rcls, req->rq_err);
-                result = smb_errno(req);
-                goto out;
-        }
-        result = -ENOENT;
-        if (req->rq_ldata < 22) {
-                PARANOIA("not enough data for %s, len=%d\n",
-                         &param[6], req->rq_ldata);
-                goto out;
-        }
-        result = 0;
-out:
-        return result;
-}
-static int
-smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
-                            struct smb_fattr *attr)
-{
-        u16 date, time;
-        int off_date = 0, off_time = 2;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
-        if (result < 0)
-                goto out_free;
-        /*
-         * Kludge alert: Win 95 swaps the date and time field,
-         * contrary to the CIFS docs and Win NT practice.
-         */
-        if (server->mnt->flags & SMB_MOUNT_WIN95) {
-                off_date = 2;
-                off_time = 0;
-        }
-        date = WVAL(req->rq_data, off_date);
-        time = WVAL(req->rq_data, off_time);
-        attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_ctime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 4 + off_date);
-        time = WVAL(req->rq_data, 4 + off_time);
-        attr->f_atime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_atime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 8 + off_date);
-        time = WVAL(req->rq_data, 8 + off_time);
-        attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_mtime.tv_nsec = 0;
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
-               DENTRY_PATH(dir), date, time, attr->f_mtime);
-#endif
-        attr->f_size = DVAL(req->rq_data, 12);
-        attr->attr = WVAL(req->rq_data, 20);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
-                            struct smb_fattr *attr)
-{
-        struct smb_request *req;
-        int result;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req,
-                                         SMB_QUERY_FILE_ALL_INFO);
-        if (result < 0)
-                goto out_free;
-        attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
-        attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
-        attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
-        /* change (24) */
-        attr->attr = WVAL(req->rq_data, 32);
-        /* pad? (34) */
-        /* allocated size (40) */
-        attr->f_size = LVAL(req->rq_data, 48);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *attr)
-{
-        struct smb_request *req;
-        int result;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req,
-                                         SMB_QUERY_FILE_UNIX_BASIC);
-        if (result < 0)
-                goto out_free;
-        smb_decode_unix_basic(attr, server, req->rq_data);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
-                    struct smb_fattr *attr)
-{
-        struct inode *inode = dir->d_inode;
-        int result;
-        /* FIXME: why not use the "all" version? */
-        result = smb_proc_getattr_trans2_std(server, dir, attr);
-        if (result < 0)
-                goto out;
-        /*
-         * None of the getattr versions here can make win9x return the right
-         * filesize if there are changes made to an open file.
-         * A seek-to-end does return the right size, but we only need to do
-         * that on files we have written.
-         */
-        if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
-            smb_is_open(inode))
-        {
-                __u16 fileid = SMB_I(inode)->fileid;
-                attr->f_size = smb_proc_seek(server, fileid, 2, 0);
-        }
-out:
-        return result;
-}
-static int
-smb_proc_ops_wait(struct smb_sb_info *server)
-{
-        int result;
-        result = wait_event_interruptible_timeout(server->conn_wq,
-                                server->conn_complete, 30*HZ);
-        if (!result || signal_pending(current))
-                return -EIO;
-        return 0;
-}
-static int
-smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
-                          struct smb_fattr *fattr)
-{
-        int result;
-        if (smb_proc_ops_wait(server) < 0)
-                return -EIO;
-        smb_init_dirent(server, fattr);
-        result = server->ops->getattr(server, dir, fattr);
-        smb_finish_dirent(server, fattr);
-        return result;
-}
-static int
-smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
-                      struct smb_cache_control *ctl)
-{
-        struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
-        if (smb_proc_ops_wait(server) < 0)
-                return -EIO;
-        return server->ops->readdir(filp, dirent, filldir, ctl);
-}
-int
-smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dir);
-        int result;
-        smb_init_dirent(server, fattr);
-        result = server->ops->getattr(server, dir, fattr);
-        smb_finish_dirent(server, fattr);
-        return result;
-}
-/*
- * Because of bugs in the core protocol, we use this only to set
- * attributes. See smb_proc_settime() below for timestamp handling.
- *
- * Bugs Noted:
- * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
- * with an undocumented error (ERRDOS code 50). Setting
- * mtime to 0 allows the attributes to be set.
- * (2) The extra parameters following the name string aren't
- * in the CIFS docs, but seem to be necessary for operation.
- */
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-                      __u16 attr)
-{
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBsetatr, 8, 0);
-        WSET(req->rq_header, smb_vwv0, attr);
-        DSET(req->rq_header, smb_vwv1, 0); /* mtime */
-        WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
-        WSET(req->rq_header, smb_vwv4, 0);
-        WSET(req->rq_header, smb_vwv5, 0);
-        WSET(req->rq_header, smb_vwv6, 0);
-        WSET(req->rq_header, smb_vwv7, 0);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
-                result = -ENAMETOOLONG;
-                goto out_free;
-        }
-        *p++ = 4;
-        *p++ = 0;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, SMBsetatr, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Because of bugs in the trans2 setattr messages, we must set
- * attributes and timestamps separately. The core SMBsetatr
- * message seems to be the only reliable way to set attributes.
- */
-int
-smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dir);
-        int result;
-        VERBOSE("setting %s/%s, open=%d\n", 
-                DENTRY_PATH(dir), smb_is_open(dir->d_inode));
-        result = smb_proc_setattr_core(server, dir, fattr->attr);
-        return result;
-}
-/*
- * Sets the timestamps for an file open with write permissions.
- */
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-                      struct inode *inode, struct smb_fattr *fattr)
-{
-        __u16 date, time;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBsetattrE, 7, 0);
-        WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
-        /* We don't change the creation time */
-        WSET(req->rq_header, smb_vwv1, 0);
-        WSET(req->rq_header, smb_vwv2, 0);
-        date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-        WSET(req->rq_header, smb_vwv3, date);
-        WSET(req->rq_header, smb_vwv4, time);
-        date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-        WSET(req->rq_header, smb_vwv5, date);
-        WSET(req->rq_header, smb_vwv6, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
-               date, time, fattr->f_mtime);
-#endif
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBsetattrE, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Bugs Noted:
- * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
- * set the file's attribute flags.
- */
-static int
-smb_proc_setattr_trans2(struct smb_sb_info *server,
-                        struct dentry *dir, struct smb_fattr *fattr)
-{
-        __u16 date, time;
-        char *p, *param;
-        int result;
-        char data[26];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, 1);      /* Info level SMB_INFO_STANDARD */
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        WSET(data, 0, 0); /* creation time */
-        WSET(data, 2, 0);
-        date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-        WSET(data, 4, date);
-        WSET(data, 6, time);
-        date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-        WSET(data, 8, date);
-        WSET(data, 10, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", 
-               DENTRY_PATH(dir), date, time, fattr->f_mtime);
-#endif
-        DSET(data, 12, 0); /* size */
-        DSET(data, 16, 0); /* blksize */
-        WSET(data, 20, 0); /* attr */
-        DSET(data, 22, 0); /* ULONG EA size */
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = 26;
-        req->rq_data  = data;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-        if (req->rq_rcls != 0)
-                result = smb_errno(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * ATTR_MODE      0x001
- * ATTR_UID       0x002
- * ATTR_GID       0x004
- * ATTR_SIZE      0x008
- * ATTR_ATIME     0x010
- * ATTR_MTIME     0x020
- * ATTR_CTIME     0x040
- * ATTR_ATIME_SET 0x080
- * ATTR_MTIME_SET 0x100
- * ATTR_FORCE     0x200 
- * ATTR_ATTR_FLAG 0x400
- *
- * major/minor should only be set by mknod.
- */
-int
-smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
-                      unsigned int major, unsigned int minor)
-{
-        struct smb_sb_info *server = server_from_dentry(d);
-        u64 nttime;
-        char *p, *param;
-        int result;
-        char data[100];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
-        WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        /* 0 L file size in bytes */
-        /* 8 L file size on disk in bytes (block count) */
-        /* 40 L uid */
-        /* 48 L gid */
-        /* 56 W file type enum */
-        /* 60 L devmajor */
-        /* 68 L devminor */
-        /* 76 L unique ID (inode) */
-        /* 84 L permissions */
-        /* 92 L link count */
-        LSET(data, 0, SMB_SIZE_NO_CHANGE);
-        LSET(data, 8, SMB_SIZE_NO_CHANGE);
-        LSET(data, 16, SMB_TIME_NO_CHANGE);
-        LSET(data, 24, SMB_TIME_NO_CHANGE);
-        LSET(data, 32, SMB_TIME_NO_CHANGE);
-        LSET(data, 40, SMB_UID_NO_CHANGE);
-        LSET(data, 48, SMB_GID_NO_CHANGE);
-        DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
-        LSET(data, 60, major);
-        LSET(data, 68, minor);
-        LSET(data, 76, 0);
-        LSET(data, 84, SMB_MODE_NO_CHANGE);
-        LSET(data, 92, 0);
-        if (attr->ia_valid & ATTR_SIZE) {
-                LSET(data, 0, attr->ia_size);
-                LSET(data, 8, 0); /* can't set anyway */
-        }
-        /*
-         * FIXME: check the conversion function it the correct one
-         *
-         * we can't set ctime but we might as well pass this to the server
-         * and let it ignore it.
-         */
-        if (attr->ia_valid & ATTR_CTIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_ctime);
-                LSET(data, 16, nttime);
-        }
-        if (attr->ia_valid & ATTR_ATIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_atime);
-                LSET(data, 24, nttime);
-        }
-        if (attr->ia_valid & ATTR_MTIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_mtime);
-                LSET(data, 32, nttime);
-        }
-        
-        if (attr->ia_valid & ATTR_UID) {
-                LSET(data, 40, attr->ia_uid);
-        }
-        if (attr->ia_valid & ATTR_GID) {
-                LSET(data, 48, attr->ia_gid); 
-        }
-        
-        if (attr->ia_valid & ATTR_MODE) {
-                LSET(data, 84, attr->ia_mode);
-        }
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = 100;
-        req->rq_data  = data;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Set the modify and access timestamps for a file.
- *
- * Incredibly enough, in all of SMB there is no message to allow
- * setting both attributes and timestamps at once. 
- *
- * Bugs Noted:
- * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message 
- * with info level 1 (INFO_STANDARD).
- * (2) Win 95 seems not to support setting directory timestamps.
- * (3) Under the core protocol apparently the only way to set the
- * timestamp is to open and close the file.
- */
-int
-smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode *inode = dentry->d_inode;
-        int result;
-        VERBOSE("setting %s/%s, open=%d\n",
-                DENTRY_PATH(dentry), smb_is_open(inode));
-        /* setting the time on a Win95 server fails (tridge) */
-        if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && 
-            !(server->mnt->flags & SMB_MOUNT_WIN95)) {
-                if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
-                        result = smb_proc_setattr_ext(server, inode, fattr);
-                else
-                        result = smb_proc_setattr_trans2(server, dentry, fattr);
-        } else {
-                /*
-                 * Fail silently on directories ... timestamp can't be set?
-                 */
-                result = 0;
-                if (S_ISREG(inode->i_mode)) {
-                        /*
-                         * Set the mtime by opening and closing the file.
-                         * Note that the file is opened read-only, but this
-                         * still allows us to set the date (tridge)
-                         */
-                        result = -EACCES;
-                        if (!smb_is_open(inode))
-                                smb_proc_open(server, dentry, SMB_O_RDONLY);
-                        if (smb_is_open(inode)) {
-                                inode->i_mtime = fattr->f_mtime;
-                                result = smb_proc_close_inode(server, inode);
-                        }
-                }
-        }
-        return result;
-}
-int
-smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
-{
-        struct smb_sb_info *server = SMB_SB(dentry->d_sb);
-        int result;
-        char *p;
-        long unit;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBdskattr, 0, 0);
-        if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
-                goto out_free;
-        p = SMB_VWV(req->rq_header);
-        unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
-        attr->f_blocks = WVAL(p, 0) * unit;
-        attr->f_bsize  = SMB_ST_BLKSIZE;
-        attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
-                   char *buffer, int len)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_QPATHINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-                &param[6], result, req->rq_rcls, req->rq_err);
-        /* copy data up to the \0 or buffer length */
-        result = len;
-        if (req->rq_ldata < len)
-                result = req->rq_ldata;
-        strncpy(buffer, req->rq_data, result);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Create a symlink object called dentry which points to oldpath.
- * Samba does not permit dangling links but returns a suitable error message.
- */
-int
-smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
-                 const char *oldpath)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = strlen(oldpath) + 1;
-        req->rq_data  = (char *) oldpath;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-                &param[6], result, req->rq_rcls, req->rq_err);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Create a hard link object called new_dentry which points to dentry.
- */
-int
-smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
-              struct dentry *new_dentry)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
-                                 new_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        /* Grr, pointless separation of parameters and data ... */
-        req->rq_data = p;
-        req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
-                                        dentry, NULL);
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-               &param[6], result, req->rq_rcls, req->rq_err);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server)
-{
-        int result;
-        int major, minor;
-        u64 caps;
-        char param[2];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 100)))
-                goto out;
-        WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
-        req->rq_trans2_command = TRANSACT2_QFSINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = 2;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        if (req->rq_ldata < 12) {
-                PARANOIA("Not enough data\n");
-                goto out_free;
-        }
-        major = WVAL(req->rq_data, 0);
-        minor = WVAL(req->rq_data, 2);
-        DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
-               major, minor);
-        /* FIXME: verify that we are ok with this major/minor? */
-        caps = LVAL(req->rq_data, 4);
-        DEBUG1("Server capabilities 0x%016llx\n", caps);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src)
-{
-        memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
-}
-/* < LANMAN2 */
-static struct smb_ops smb_ops_core =
-{
-        .read           = smb_proc_read,
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_short,
-        .getattr        = smb_proc_getattr_core,
-        .truncate       = smb_proc_trunc32,
-};
-/* LANMAN2, OS/2, others? */
-static struct smb_ops smb_ops_os2 =
-{
-        .read           = smb_proc_read,
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_trans2_std,
-        .truncate       = smb_proc_trunc32,
-};
-/* Win95, and possibly some NetApp versions too */
-static struct smb_ops smb_ops_win95 =
-{
-        .read           = smb_proc_read,    /* does not support 12word readX */
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_95,
-        .truncate       = smb_proc_trunc95,
-};
-/* Samba, NT4 and NT5 */
-static struct smb_ops smb_ops_winNT =
-{
-        .read           = smb_proc_readX,
-        .write          = smb_proc_writeX,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_trans2_all,
-        .truncate       = smb_proc_trunc64,
-};
-/* Samba w/ unix extensions. Others? */
-static struct smb_ops smb_ops_unix =
-{
-        .read           = smb_proc_readX,
-        .write          = smb_proc_writeX,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_unix,
-        /* FIXME: core/ext/time setattr needs to be cleaned up! */
-        /* .setattr     = smb_proc_setattr_unix, */
-        .truncate       = smb_proc_trunc64,
-};
-/* Place holder until real ops are in place */
-static struct smb_ops smb_ops_null =
-{
-        .readdir        = smb_proc_readdir_null,
-        .getattr        = smb_proc_getattr_null,
-};
-void smb_install_null_ops(struct smb_ops *ops)
-{
-        install_ops(ops, &smb_ops_null);
-}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Autogenerated with cproto on:  Sat Sep 13 17:18:51 CEST 2003
- */
-struct smb_request;
-struct sock;
-struct statfs;
-/* proc.c */
-extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
-extern __u32 smb_len(__u8 *p);
-extern int smb_get_rsize(struct smb_sb_info *server);
-extern int smb_get_wsize(struct smb_sb_info *server);
-extern int smb_errno(struct smb_request *req);
-extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
-extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
-extern int smb_open(struct dentry *dentry, int wish);
-extern int smb_close(struct inode *ino);
-extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
-extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
-extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
-extern int smb_proc_mkdir(struct dentry *dentry);
-extern int smb_proc_rmdir(struct dentry *dentry);
-extern int smb_proc_unlink(struct dentry *dentry);
-extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
-extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-                                 struct super_block *sb);
-extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
-extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
-extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
-extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
-extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
-extern void smb_install_null_ops(struct smb_ops *ops);
-/* dir.c */
-extern const struct file_operations smb_dir_operations;
-extern const struct inode_operations smb_dir_inode_operations;
-extern const struct inode_operations smb_dir_inode_operations_unix;
-extern void smb_new_dentry(struct dentry *dentry);
-extern void smb_renew_times(struct dentry *dentry);
-/* cache.c */
-extern void smb_invalid_dir_cache(struct inode *dir);
-extern void smb_invalidate_dircache_entries(struct dentry *parent);
-extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
-extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
-/* sock.c */
-extern void smb_data_ready(struct sock *sk, int len);
-extern int smb_valid_socket(struct inode *inode);
-extern void smb_close_socket(struct smb_sb_info *server);
-extern int smb_recv_available(struct smb_sb_info *server);
-extern int smb_receive_header(struct smb_sb_info *server);
-extern int smb_receive_drop(struct smb_sb_info *server);
-extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
-extern int smb_send_request(struct smb_request *req);
-/* inode.c */
-extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
-extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_invalidate_inodes(struct smb_sb_info *server);
-extern int smb_revalidate_inode(struct dentry *dentry);
-extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
-/* file.c */
-extern const struct address_space_operations smb_file_aops;
-extern const struct file_operations smb_file_operations;
-extern const struct inode_operations smb_file_inode_operations;
-/* ioctl.c */
-extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-/* smbiod.c */
-extern void smbiod_wake_up(void);
-extern int smbiod_register_server(struct smb_sb_info *server);
-extern void smbiod_unregister_server(struct smb_sb_info *server);
-extern void smbiod_flush(struct smb_sb_info *server);
-extern int smbiod_retry(struct smb_sb_info *server);
-/* request.c */
-extern int smb_init_request_cache(void);
-extern void smb_destroy_request_cache(void);
-extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
-extern void smb_rput(struct smb_request *req);
-extern int smb_add_request(struct smb_request *req);
-extern int smb_request_send_server(struct smb_sb_info *server);
-extern int smb_request_recv(struct smb_sb_info *server);
-/* symlink.c */
-extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
-extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- *  request.c
- *
- *  Copyright (C) 2001 by Urban Widmark
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-/* #define SMB_SLAB_DEBUG       (SLAB_RED_ZONE | SLAB_POISON) */
-#define SMB_SLAB_DEBUG  0
-/* cache for request structures */
-static struct kmem_cache *req_cachep;
-static int smb_request_send_req(struct smb_request *req);
-/*
-  /proc/slabinfo:
-  name, active, num, objsize, active_slabs, num_slaps, #pages
-*/
-int smb_init_request_cache(void)
-{
-        req_cachep = kmem_cache_create("smb_request",
-                                       sizeof(struct smb_request), 0,
-                                       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
-                                       NULL);
-        if (req_cachep == NULL)
-                return -ENOMEM;
-        return 0;
-}
-void smb_destroy_request_cache(void)
-{
-        kmem_cache_destroy(req_cachep);
-}
-/*
- * Allocate and initialise a request structure
- */
-static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
-                                                int bufsize)
-{
-        struct smb_request *req;
-        unsigned char *buf = NULL;
-        req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
-        VERBOSE("allocating request: %p\n", req);
-        if (!req)
-                goto out;
-        if (bufsize > 0) {
-                buf = kmalloc(bufsize, GFP_NOFS);
-                if (!buf) {
-                        kmem_cache_free(req_cachep, req);
-                        return NULL;
-                }
-        }
-        req->rq_buffer = buf;
-        req->rq_bufsize = bufsize;
-        req->rq_server = server;
-        init_waitqueue_head(&req->rq_wait);
-        INIT_LIST_HEAD(&req->rq_queue);
-        atomic_set(&req->rq_count, 1);
-out:
-        return req;
-}
-struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
-{
-        struct smb_request *req = NULL;
-        for (;;) {
-                atomic_inc(&server->nr_requests);
-                if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
-                        req = smb_do_alloc_request(server, bufsize);
-                        if (req != NULL)
-                                break;
-                }
-#if 0
-                /*
-                 * Try to free up at least one request in order to stay
-                 * below the hard limit
-                 */
-                if (nfs_try_to_free_pages(server))
-                        continue;
-                if (fatal_signal_pending(current))
-                        return ERR_PTR(-ERESTARTSYS);
-                current->policy = SCHED_YIELD;
-                schedule();
-#else
-                /* FIXME: we want something like nfs does above, but that
-                   requires changes to all callers and can wait. */
-                break;
-#endif
-        }
-        return req;
-}
-static void smb_free_request(struct smb_request *req)
-{
-        atomic_dec(&req->rq_server->nr_requests);
-        if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
-                kfree(req->rq_buffer);
-        kfree(req->rq_trans2buffer);
-        kmem_cache_free(req_cachep, req);
-}
-/*
- * What prevents a rget to race with a rput? The count must never drop to zero
- * while it is in use. Only rput if it is ok that it is free'd.
- */
-static void smb_rget(struct smb_request *req)
-{
-        atomic_inc(&req->rq_count);
-}
-void smb_rput(struct smb_request *req)
-{
-        if (atomic_dec_and_test(&req->rq_count)) {
-                list_del_init(&req->rq_queue);
-                smb_free_request(req);
-        }
-}
-/* setup to receive the data part of the SMB */
-static int smb_setup_bcc(struct smb_request *req)
-{
-        int result = 0;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-        if (req->rq_rlen > req->rq_bufsize) {
-                PARANOIA("Packet too large %d > %d\n",
-                         req->rq_rlen, req->rq_bufsize);
-                return -ENOBUFS;
-        }
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = req->rq_rlen;
-        req->rq_iovlen = 1;
-        return result;
-}
-/*
- * Prepare a "normal" request structure.
- */
-static int smb_setup_request(struct smb_request *req)
-{
-        int len = smb_len(req->rq_header) + 4;
-        req->rq_slen = len;
-        /* if we expect a data part in the reply we set the iov's to read it */
-        if (req->rq_resp_bcc)
-                req->rq_setup_read = smb_setup_bcc;
-        /* This tries to support re-using the same request */
-        req->rq_bytes_sent = 0;
-        req->rq_rcls = 0;
-        req->rq_err = 0;
-        req->rq_errno = 0;
-        req->rq_fragment = 0;
-        kfree(req->rq_trans2buffer);
-        req->rq_trans2buffer = NULL;
-        return 0;
-}
-/*
- * Prepare a transaction2 request structure
- */
-static int smb_setup_trans2request(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        int mparam, mdata;
-        static unsigned char padding[4];
-        /* I know the following is very ugly, but I want to build the
-           smb packet as efficiently as possible. */
-        const int smb_parameters = 15;
-        const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
-        const int oparam = ALIGN(header + 3, sizeof(u32));
-        const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
-        const int bcc = (req->rq_data ? odata + req->rq_ldata :
-                                        oparam + req->rq_lparm) - header;
-        if ((bcc + oparam) > server->opt.max_xmit)
-                return -ENOMEM;
-        smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
-        /*
-         * max parameters + max data + max setup == bufsize to make NT4 happy
-         * and not abort the transfer or split into multiple responses. It also
-         * makes smbfs happy as handling packets larger than the buffer size
-         * is extra work.
-         *
-         * OS/2 is probably going to hate me for this ...
-         */
-        mparam = SMB_TRANS2_MAX_PARAM;
-        mdata = req->rq_bufsize - mparam;
-        mdata = server->opt.max_xmit - mparam - 100;
-        if (mdata < 1024) {
-                mdata = 1024;
-                mparam = 20;
-        }
-#if 0
-        /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
-           to return as one SMB. Useful for testing the fragmented trans2
-           handling. */
-        mdata = 8192;
-#endif
-        WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
-        WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
-        WSET(req->rq_header, smb_mprcnt, mparam);
-        WSET(req->rq_header, smb_mdrcnt, mdata);
-        WSET(req->rq_header, smb_msrcnt, 0);    /* max setup always 0 ? */
-        WSET(req->rq_header, smb_flags, 0);
-        DSET(req->rq_header, smb_timeout, 0);
-        WSET(req->rq_header, smb_pscnt, req->rq_lparm);
-        WSET(req->rq_header, smb_psoff, oparam - 4);
-        WSET(req->rq_header, smb_dscnt, req->rq_ldata);
-        WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
-        *(req->rq_header + smb_suwcnt) = 0x01;          /* setup count */
-        *(req->rq_header + smb_suwcnt + 1) = 0x00;      /* reserved */
-        WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
-        req->rq_iovlen = 2;
-        req->rq_iov[0].iov_base = (void *) req->rq_header;
-        req->rq_iov[0].iov_len = oparam;
-        req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
-        req->rq_iov[1].iov_len = req->rq_lparm;
-        req->rq_slen = oparam + req->rq_lparm;
-        if (req->rq_data) {
-                req->rq_iovlen += 2;
-                req->rq_iov[2].iov_base = padding;
-                req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
-                req->rq_iov[3].iov_base = req->rq_data;
-                req->rq_iov[3].iov_len = req->rq_ldata;
-                req->rq_slen = odata + req->rq_ldata;
-        }
-        /* always a data part for trans2 replies */
-        req->rq_setup_read = smb_setup_bcc;
-        return 0;
-}
-/*
- * Add a request and tell smbiod to process it
- */
-int smb_add_request(struct smb_request *req)
-{
-        long timeleft;
-        struct smb_sb_info *server = req->rq_server;
-        int result = 0;
-        smb_setup_request(req);
-        if (req->rq_trans2_command) {
-                if (req->rq_buffer == NULL) {
-                        PARANOIA("trans2 attempted without response buffer!\n");
-                        return -EIO;
-                }
-                result = smb_setup_trans2request(req);
-        }
-        if (result < 0)
-                return result;
-#ifdef SMB_DEBUG_PACKET_SIZE
-        add_xmit_stats(req);
-#endif
-        /* add 'req' to the queue of requests */
-        if (smb_lock_server_interruptible(server))
-                return -EINTR;
-        /*
-         * Try to send the request as the process. If that fails we queue the
-         * request and let smbiod send it later.
-         */
-        /* FIXME: each server has a number on the maximum number of parallel
-           requests. 10, 50 or so. We should not allow more requests to be
-           active. */
-        if (server->mid > 0xf000)
-                server->mid = 0;
-        req->rq_mid = server->mid++;
-        WSET(req->rq_header, smb_mid, req->rq_mid);
-        result = 0;
-        if (server->state == CONN_VALID) {
-                if (list_empty(&server->xmitq))
-                        result = smb_request_send_req(req);
-                if (result < 0) {
-                        /* Connection lost? */
-                        server->conn_error = result;
-                        server->state = CONN_INVALID;
-                }
-        }
-        if (result != 1)
-                list_add_tail(&req->rq_queue, &server->xmitq);
-        smb_rget(req);
-        if (server->state != CONN_VALID)
-                smbiod_retry(server);
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        timeleft = wait_event_interruptible_timeout(req->rq_wait,
-                                    req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
-        if (!timeleft || signal_pending(current)) {
-                /*
-                 * On timeout or on interrupt we want to try and remove the
-                 * request from the recvq/xmitq.
-                 * First check if the request is still part of a queue. (May
-                 * have been removed by some error condition)
-                 */
-                smb_lock_server(server);
-                if (!list_empty(&req->rq_queue)) {
-                        list_del_init(&req->rq_queue);
-                        smb_rput(req);
-                }
-                smb_unlock_server(server);
-        }
-        if (!timeleft) {
-                PARANOIA("request [%p, mid=%d] timed out!\n",
-                         req, req->rq_mid);
-                VERBOSE("smb_com:  %02x\n", *(req->rq_header + smb_com));
-                VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
-                VERBOSE("smb_flg:  %02x\n", *(req->rq_header + smb_flg));
-                VERBOSE("smb_tid:  %04x\n", WVAL(req->rq_header, smb_tid));
-                VERBOSE("smb_pid:  %04x\n", WVAL(req->rq_header, smb_pid));
-                VERBOSE("smb_uid:  %04x\n", WVAL(req->rq_header, smb_uid));
-                VERBOSE("smb_mid:  %04x\n", WVAL(req->rq_header, smb_mid));
-                VERBOSE("smb_wct:  %02x\n", *(req->rq_header + smb_wct));
-                req->rq_rcls = ERRSRV;
-                req->rq_err  = ERRtimeout;
-                /* Just in case it was "stuck" */
-                smbiod_wake_up();
-        }
-        VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
-        if (req->rq_rcls != 0)
-                req->rq_errno = smb_errno(req);
-        if (signal_pending(current))
-                req->rq_errno = -ERESTARTSYS;
-        return req->rq_errno;
-}
-/*
- * Send a request and place it on the recvq if successfully sent.
- * Must be called with the server lock held.
- */
-static int smb_request_send_req(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        int result;
-        if (req->rq_bytes_sent == 0) {
-                WSET(req->rq_header, smb_tid, server->opt.tid);
-                WSET(req->rq_header, smb_pid, 1);
-                WSET(req->rq_header, smb_uid, server->opt.server_uid);
-        }
-        result = smb_send_request(req);
-        if (result < 0 && result != -EAGAIN)
-                goto out;
-        result = 0;
-        if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
-                goto out;
-        list_move_tail(&req->rq_queue, &server->recvq);
-        result = 1;
-out:
-        return result;
-}
-/*
- * Sends one request for this server. (smbiod)
- * Must be called with the server lock held.
- * Returns: <0 on error
- *           0 if no request could be completely sent
- *           1 if all data for one request was sent
- */
-int smb_request_send_server(struct smb_sb_info *server)
-{
-        struct list_head *head;
-        struct smb_request *req;
-        int result;
-        if (server->state != CONN_VALID)
-                return 0;
-        /* dequeue first request, if any */
-        req = NULL;
-        head = server->xmitq.next;
-        if (head != &server->xmitq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-        }
-        if (!req)
-                return 0;
-        result = smb_request_send_req(req);
-        if (result < 0) {
-                server->conn_error = result;
-                list_move(&req->rq_queue, &server->xmitq);
-                result = -EIO;
-                goto out;
-        }
-out:
-        return result;
-}
-/*
- * Try to find a request matching this "mid". Typically the first entry will
- * be the matching one.
- */
-static struct smb_request *find_request(struct smb_sb_info *server, int mid)
-{
-        struct list_head *tmp;
-        struct smb_request *req = NULL;
-        list_for_each(tmp, &server->recvq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                if (req->rq_mid == mid) {
-                        break;
-                }
-                req = NULL;
-        }
-        if (!req) {
-                VERBOSE("received reply with mid %d but no request!\n",
-                        WVAL(server->header, smb_mid));
-                server->rstate = SMB_RECV_DROP;
-        }
-        return req;
-}
-/*
- * Called when we have read the smb header and believe this is a response.
- */
-static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
-{
-        int hdrlen, wct;
-        memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
-        wct = *(req->rq_header + smb_wct);
-        if (wct > 20) { 
-                PARANOIA("wct too large, %d > 20\n", wct);
-                server->rstate = SMB_RECV_DROP;
-                return 0;
-        }
-        req->rq_resp_wct = wct;
-        hdrlen = SMB_HEADER_LEN + wct*2 + 2;
-        VERBOSE("header length: %d   smb_wct: %2d\n", hdrlen, wct);
-        req->rq_bytes_recvd = SMB_HEADER_LEN;
-        req->rq_rlen = hdrlen;
-        req->rq_iov[0].iov_base = req->rq_header;
-        req->rq_iov[0].iov_len  = hdrlen;
-        req->rq_iovlen = 1;
-        server->rstate = SMB_RECV_PARAM;
-#ifdef SMB_DEBUG_PACKET_SIZE
-        add_recv_stats(smb_len(server->header));
-#endif
-        return 0;
-}
-/*
- * Reads the SMB parameters
- */
-static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
-{
-        int result;
-        result = smb_receive(server, req);
-        if (result < 0)
-                return result;
-        if (req->rq_bytes_recvd < req->rq_rlen)
-                return 0;
-        VERBOSE("result: %d   smb_bcc:  %04x\n", result,
-                WVAL(req->rq_header, SMB_HEADER_LEN +
-                     (*(req->rq_header + smb_wct) * 2)));
-        result = 0;
-        req->rq_iov[0].iov_base = NULL;
-        req->rq_rlen = 0;
-        if (req->rq_callback)
-                req->rq_callback(req);
-        else if (req->rq_setup_read)
-                result = req->rq_setup_read(req);
-        if (result < 0) {
-                server->rstate = SMB_RECV_DROP;
-                return result;
-        }
-        server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
-        req->rq_bytes_recvd = 0;        // recvd out of the iov
-        VERBOSE("rlen: %d\n", req->rq_rlen);
-        if (req->rq_rlen < 0) {
-                PARANOIA("Parameters read beyond end of packet!\n");
-                server->rstate = SMB_RECV_END;
-                return -EIO;
-        }
-        return 0;
-}
-/*
- * Reads the SMB data
- */
-static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
-{
-        int result;
-        result = smb_receive(server, req);
-        if (result < 0)
-                goto out;
-        if (req->rq_bytes_recvd < req->rq_rlen)
-                goto out;
-        server->rstate = SMB_RECV_END;
-out:
-        VERBOSE("result: %d\n", result);
-        return result;
-}
-/*
- * Receive a transaction2 response
- * Return: 0 if the response has been fully read
- *         1 if there are further "fragments" to read
- *        <0 if there is an error
- */
-static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
-{
-        unsigned char *inbuf;
-        unsigned int parm_disp, parm_offset, parm_count, parm_tot;
-        unsigned int data_disp, data_offset, data_count, data_tot;
-        int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-        VERBOSE("handling trans2\n");
-        inbuf = req->rq_header;
-        data_tot    = WVAL(inbuf, smb_tdrcnt);
-        parm_tot    = WVAL(inbuf, smb_tprcnt);
-        parm_disp   = WVAL(inbuf, smb_prdisp);
-        parm_offset = WVAL(inbuf, smb_proff);
-        parm_count  = WVAL(inbuf, smb_prcnt);
-        data_disp   = WVAL(inbuf, smb_drdisp);
-        data_offset = WVAL(inbuf, smb_droff);
-        data_count  = WVAL(inbuf, smb_drcnt);
-        /* Modify offset for the split header/buffer we use */
-        if (data_count || data_offset) {
-                if (unlikely(data_offset < hdrlen))
-                        goto out_bad_data;
-                else
-                        data_offset -= hdrlen;
-        }
-        if (parm_count || parm_offset) {
-                if (unlikely(parm_offset < hdrlen))
-                        goto out_bad_parm;
-                else
-                        parm_offset -= hdrlen;
-        }
-        if (parm_count == parm_tot && data_count == data_tot) {
-                /*
-                 * This packet has all the trans2 data.
-                 *
-                 * We setup the request so that this will be the common
-                 * case. It may be a server error to not return a
-                 * response that fits.
-                 */
-                VERBOSE("single trans2 response  "
-                        "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-                        data_count, parm_count,
-                        data_offset, parm_offset);
-                req->rq_ldata = data_count;
-                req->rq_lparm = parm_count;
-                req->rq_data = req->rq_buffer + data_offset;
-                req->rq_parm = req->rq_buffer + parm_offset;
-                if (unlikely(parm_offset + parm_count > req->rq_rlen))
-                        goto out_bad_parm;
-                if (unlikely(data_offset + data_count > req->rq_rlen))
-                        goto out_bad_data;
-                return 0;
-        }
-        VERBOSE("multi trans2 response  "
-                "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-                req->rq_fragment,
-                data_count, parm_count,
-                data_offset, parm_offset);
-        if (!req->rq_fragment) {
-                int buf_len;
-                /* We got the first trans2 fragment */
-                req->rq_fragment = 1;
-                req->rq_total_data = data_tot;
-                req->rq_total_parm = parm_tot;
-                req->rq_ldata = 0;
-                req->rq_lparm = 0;
-                buf_len = data_tot + parm_tot;
-                if (buf_len > SMB_MAX_PACKET_SIZE)
-                        goto out_too_long;
-                req->rq_trans2bufsize = buf_len;
-                req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
-                if (!req->rq_trans2buffer)
-                        goto out_no_mem;
-                req->rq_parm = req->rq_trans2buffer;
-                req->rq_data = req->rq_trans2buffer + parm_tot;
-        } else if (unlikely(req->rq_total_data < data_tot ||
-                            req->rq_total_parm < parm_tot))
-                goto out_data_grew;
-        if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
-                     parm_offset + parm_count > req->rq_rlen))
-                goto out_bad_parm;
-        if (unlikely(data_disp + data_count > req->rq_total_data ||
-                     data_offset + data_count > req->rq_rlen))
-                goto out_bad_data;
-        inbuf = req->rq_buffer;
-        memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
-        memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
-        req->rq_ldata += data_count;
-        req->rq_lparm += parm_count;
-        /*
-         * Check whether we've received all of the data. Note that
-         * we use the packet totals -- total lengths might shrink!
-         */
-        if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
-                req->rq_ldata = data_tot;
-                req->rq_lparm = parm_tot;
-                return 0;
-        }
-        return 1;
-out_too_long:
-        printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
-                data_tot, parm_tot);
-        goto out_EIO;
-out_no_mem:
-        printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
-               req->rq_trans2bufsize);
-        req->rq_errno = -ENOMEM;
-        goto out;
-out_data_grew:
-        printk(KERN_ERR "smb_trans2: data/params grew!\n");
-        goto out_EIO;
-out_bad_parm:
-        printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-               parm_disp, parm_count, parm_tot, parm_offset);
-        goto out_EIO;
-out_bad_data:
-        printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-               data_disp, data_count, data_tot, data_offset);
-out_EIO:
-        req->rq_errno = -EIO;
-out:
-        return req->rq_errno;
-}
-/*
- * State machine for receiving responses. We handle the fact that we can't
- * read the full response in one try by having states telling us how much we
- * have read.
- *
- * Must be called with the server lock held (only called from smbiod).
- *
- * Return: <0 on error
- */
-int smb_request_recv(struct smb_sb_info *server)
-{
-        struct smb_request *req = NULL;
-        int result = 0;
-        if (smb_recv_available(server) <= 0)
-                return 0;
-        VERBOSE("state: %d\n", server->rstate);
-        switch (server->rstate) {
-        case SMB_RECV_DROP:
-                result = smb_receive_drop(server);
-                if (result < 0)
-                        break;
-                if (server->rstate == SMB_RECV_DROP)
-                        break;
-                server->rstate = SMB_RECV_START;
-                /* fallthrough */
-        case SMB_RECV_START:
-                server->smb_read = 0;
-                server->rstate = SMB_RECV_HEADER;
-                /* fallthrough */
-        case SMB_RECV_HEADER:
-                result = smb_receive_header(server);
-                if (result < 0)
-                        break;
-                if (server->rstate == SMB_RECV_HEADER)
-                        break;
-                if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
-                        server->rstate = SMB_RECV_REQUEST;
-                        break;
-                }
-                if (server->rstate != SMB_RECV_HCOMPLETE)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_HCOMPLETE:
-                req = find_request(server, WVAL(server->header, smb_mid));
-                if (!req)
-                        break;
-                smb_init_request(server, req);
-                req->rq_rcls = *(req->rq_header + smb_rcls);
-                req->rq_err  = WVAL(req->rq_header, smb_err);
-                if (server->rstate != SMB_RECV_PARAM)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_PARAM:
-                if (!req)
-                        req = find_request(server,WVAL(server->header,smb_mid));
-                if (!req)
-                        break;
-                result = smb_recv_param(server, req);
-                if (result < 0)
-                        break;
-                if (server->rstate != SMB_RECV_DATA)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_DATA:
-                if (!req)
-                        req = find_request(server,WVAL(server->header,smb_mid));
-                if (!req)
-                        break;
-                result = smb_recv_data(server, req);
-                if (result < 0)
-                        break;
-                break;
-                /* We should never be called with any of these states */
-        case SMB_RECV_END:
-        case SMB_RECV_REQUEST:
-                BUG();
-        }
-        if (result < 0) {
-                /* We saw an error */
-                return result;
-        }
-        if (server->rstate != SMB_RECV_END)
-                return 0;
-        result = 0;
-        if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
-                result = smb_recv_trans2(server, req);
-        /*
-         * Response completely read. Drop any extra bytes sent by the server.
-         * (Yes, servers sometimes add extra bytes to responses)
-         */
-        VERBOSE("smb_len: %d   smb_read: %d\n",
-                server->smb_len, server->smb_read);
-        if (server->smb_read < server->smb_len)
-                smb_receive_drop(server);
-        server->rstate = SMB_RECV_START;
-        if (!result) {
-                list_del_init(&req->rq_queue);
-                req->rq_flags |= SMB_REQ_RECEIVED;
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        return 0;
-}
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-#include <linux/wait.h>
-struct smb_request {
-        struct list_head rq_queue;      /* recvq or xmitq for the server */
-        atomic_t rq_count;
-        wait_queue_head_t rq_wait;
-        int rq_flags;
-        int rq_mid;     /* multiplex ID, set by request.c */
-        struct smb_sb_info *rq_server;
-        /* header + word count + parameter words + byte count */
-        unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
-        int rq_bufsize;
-        unsigned char *rq_buffer;
-        /* FIXME: this is not good enough for merging IO requests. */
-        unsigned char *rq_page;
-        int rq_rsize;
-        int rq_resp_wct;
-        int rq_resp_bcc;
-        int rq_rlen;
-        int rq_bytes_recvd;
-        int rq_slen;
-        int rq_bytes_sent;
-        int rq_iovlen;
-        struct kvec rq_iov[4];
-        int (*rq_setup_read) (struct smb_request *);
-        void (*rq_callback) (struct smb_request *);
-        /* ------ trans2 stuff ------ */
-        u16 rq_trans2_command;  /* 0 if not a trans2 request */
-        unsigned int rq_ldata;
-        unsigned char *rq_data;
-        unsigned int rq_lparm;
-        unsigned char *rq_parm;
-        int rq_fragment;
-        u32 rq_total_data;
-        u32 rq_total_parm;
-        int rq_trans2bufsize;
-        unsigned char *rq_trans2buffer;
-        /* ------ response ------ */
-        unsigned short rq_rcls;
-        unsigned short rq_err;
-        int rq_errno;
-};
-#define SMB_REQ_STATIC          0x0001  /* rq_buffer is static */
-#define SMB_REQ_NORETRY         0x0002  /* request is invalid after retry */
-#define SMB_REQ_TRANSMITTED     0x4000  /* all data has been sent */
-#define SMB_REQ_RECEIVED        0x8000  /* reply received, smbiod is done */
-#define xSMB_REQ_NOREPLY        0x0004  /* we don't want the reply (if any) */
-#define xSMB_REQ_NORECEIVER     0x0008  /* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Defines some debug macros for smbfs.
- */
-/* This makes a dentry parent/child name pair. Useful for debugging printk's */
-#define DENTRY_PATH(dentry) \
-        (dentry)->d_parent->d_name.name,(dentry)->d_name.name
-/*
- * safety checks that should never happen ???
- * these are normally enabled.
- */
-#ifdef SMBFS_PARANOIA
-# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
-#else
-# define PARANOIA(f, a...) do { ; } while(0)
-#endif
-/* lots of debug messages */
-#ifdef SMBFS_DEBUG_VERBOSE
-# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-# define VERBOSE(f, a...) do { ; } while(0)
-#endif
-/*
- * "normal" debug messages, but not with a normal DEBUG define ... way
- * too common name.
- */
-#ifdef SMBFS_DEBUG
-#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-#define DEBUG1(f, a...) do { ; } while(0)
-#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- *  smbiod.c
- *
- *  Copyright (C) 2000, Charles Loep / Corel Corp.
- *  Copyright (C) 2001, Urban Widmark
- */
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/kthread.h>
-#include <net/ip.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-enum smbiod_state {
-        SMBIOD_DEAD,
-        SMBIOD_STARTING,
-        SMBIOD_RUNNING,
-};
-static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static struct task_struct *smbiod_thread;
-static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
-static LIST_HEAD(smb_servers);
-static DEFINE_SPINLOCK(servers_lock);
-#define SMBIOD_DATA_READY       (1<<0)
-static unsigned long smbiod_flags;
-static int smbiod(void *);
-static int smbiod_start(void);
-/*
- * called when there's work for us to do
- */
-void smbiod_wake_up(void)
-{
-        if (smbiod_state == SMBIOD_DEAD)
-                return;
-        set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-        wake_up_interruptible(&smbiod_wait);
-}
-/*
- * start smbiod if none is running
- */
-static int smbiod_start(void)
-{
-        struct task_struct *tsk;
-        int err = 0;
-        if (smbiod_state != SMBIOD_DEAD)
-                return 0;
-        smbiod_state = SMBIOD_STARTING;
-        __module_get(THIS_MODULE);
-        spin_unlock(&servers_lock);
-        tsk = kthread_run(smbiod, NULL, "smbiod");
-        if (IS_ERR(tsk)) {
-                err = PTR_ERR(tsk);
-                module_put(THIS_MODULE);
-        }
-        spin_lock(&servers_lock);
-        if (err < 0) {
-                smbiod_state = SMBIOD_DEAD;
-                smbiod_thread = NULL;
-        } else {
-                smbiod_state = SMBIOD_RUNNING;
-                smbiod_thread = tsk;
-        }
-        return err;
-}
-/*
- * register a server & start smbiod if necessary
- */
-int smbiod_register_server(struct smb_sb_info *server)
-{
-        int ret;
-        spin_lock(&servers_lock);
-        list_add(&server->entry, &smb_servers);
-        VERBOSE("%p\n", server);
-        ret = smbiod_start();
-        spin_unlock(&servers_lock);
-        return ret;
-}
-/*
- * Unregister a server
- * Must be called with the server lock held.
- */
-void smbiod_unregister_server(struct smb_sb_info *server)
-{
-        spin_lock(&servers_lock);
-        list_del_init(&server->entry);
-        VERBOSE("%p\n", server);
-        spin_unlock(&servers_lock);
-        smbiod_wake_up();
-        smbiod_flush(server);
-}
-void smbiod_flush(struct smb_sb_info *server)
-{
-        struct list_head *tmp, *n;
-        struct smb_request *req;
-        list_for_each_safe(tmp, n, &server->xmitq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        list_for_each_safe(tmp, n, &server->recvq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-}
-/*
- * Wake up smbmount and make it reconnect to the server.
- * This must be called with the server locked.
- *
- * FIXME: add smbconnect version to this
- */
-int smbiod_retry(struct smb_sb_info *server)
-{
-        struct list_head *head;
-        struct smb_request *req;
-        struct pid *pid = get_pid(server->conn_pid);
-        int result = 0;
-        VERBOSE("state: %d\n", server->state);
-        if (server->state == CONN_VALID || server->state == CONN_RETRYING)
-                goto out;
-        smb_invalidate_inodes(server);
-        /*
-         * Some requests are meaningless after a retry, so we abort them.
-         * One example are all requests using 'fileid' since the files are
-         * closed on retry.
-         */
-        head = server->xmitq.next;
-        while (head != &server->xmitq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-                head = head->next;
-                req->rq_bytes_sent = 0;
-                if (req->rq_flags & SMB_REQ_NORETRY) {
-                        VERBOSE("aborting request %p on xmitq\n", req);
-                        req->rq_errno = -EIO;
-                        list_del_init(&req->rq_queue);
-                        smb_rput(req);
-                        wake_up_interruptible(&req->rq_wait);
-                }
-        }
-        /*
-         * FIXME: test the code for retrying request we already sent
-         */
-        head = server->recvq.next;
-        while (head != &server->recvq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-                head = head->next;
-#if 0
-                if (req->rq_flags & SMB_REQ_RETRY) {
-                        /* must move the request to the xmitq */
-                        VERBOSE("retrying request %p on recvq\n", req);
-                        list_move(&req->rq_queue, &server->xmitq);
-                        continue;
-                }
-#endif
-                VERBOSE("aborting request %p on recvq\n", req);
-                /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        smb_close_socket(server);
-        if (!pid) {
-                /* FIXME: this is fatal, umount? */
-                printk(KERN_ERR "smb_retry: no connection process\n");
-                server->state = CONN_RETRIED;
-                goto out;
-        }
-        /*
-         * Change state so that only one retry per server will be started.
-         */
-        server->state = CONN_RETRYING;
-        /*
-         * Note: use the "priv" flag, as a user process may need to reconnect.
-         */
-        result = kill_pid(pid, SIGUSR1, 1);
-        if (result) {
-                /* FIXME: this is most likely fatal, umount? */
-                printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
-                goto out;
-        }
-        VERBOSE("signalled pid %d\n", pid_nr(pid));
-        /* FIXME: The retried requests should perhaps get a "time boost". */
-out:
-        put_pid(pid);
-        return result;
-}
-/*
- * Currently handles lockingX packets.
- */
-static void smbiod_handle_request(struct smb_sb_info *server)
-{
-        PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
-        server->rstate = SMB_RECV_DROP;
-}
-/*
- * Do some IO for one server.
- */
-static void smbiod_doio(struct smb_sb_info *server)
-{
-        int result;
-        int maxwork = 7;
-        if (server->state != CONN_VALID)
-                goto out;
-        do {
-                result = smb_request_recv(server);
-                if (result < 0) {
-                        server->state = CONN_INVALID;
-                        smbiod_retry(server);
-                        goto out;       /* reconnecting is slow */
-                } else if (server->rstate == SMB_RECV_REQUEST)
-                        smbiod_handle_request(server);
-        } while (result > 0 && maxwork-- > 0);
-        /*
-         * If there is more to read then we want to be sure to wake up again.
-         */
-        if (server->state != CONN_VALID)
-                goto out;
-        if (smb_recv_available(server) > 0)
-                set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-        do {
-                result = smb_request_send_server(server);
-                if (result < 0) {
-                        server->state = CONN_INVALID;
-                        smbiod_retry(server);
-                        goto out;       /* reconnecting is slow */
-                }
-        } while (result > 0);
-        /*
-         * If the last request was not sent out we want to wake up again.
-         */
-        if (!list_empty(&server->xmitq))
-                set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-out:
-        return;
-}
-/*
- * smbiod kernel thread
- */
-static int smbiod(void *unused)
-{
-        VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
-        for (;;) {
-                struct smb_sb_info *server;
-                struct list_head *pos, *n;
-                /* FIXME: Use poll? */
-                wait_event_interruptible(smbiod_wait,
-                         test_bit(SMBIOD_DATA_READY, &smbiod_flags));
-                if (signal_pending(current)) {
-                        spin_lock(&servers_lock);
-                        smbiod_state = SMBIOD_DEAD;
-                        spin_unlock(&servers_lock);
-                        break;
-                }
-                clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
-                spin_lock(&servers_lock);
-                if (list_empty(&smb_servers)) {
-                        smbiod_state = SMBIOD_DEAD;
-                        spin_unlock(&servers_lock);
-                        break;
-                }
-                list_for_each_safe(pos, n, &smb_servers) {
-                        server = list_entry(pos, struct smb_sb_info, entry);
-                        VERBOSE("checking server %p\n", server);
-                        if (server->state == CONN_VALID) {
-                                spin_unlock(&servers_lock);
-                                smb_lock_server(server);
-                                smbiod_doio(server);
-                                smb_unlock_server(server);
-                                spin_lock(&servers_lock);
-                        }
-                }
-                spin_unlock(&servers_lock);
-        }
-        VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
-        module_put_and_exit(0);
-}
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- *  sock.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/fcntl.h>
-#include <linux/file.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/mm.h>
-#include <linux/netdevice.h>
-#include <linux/workqueue.h>
-#include <net/scm.h>
-#include <net/tcp_states.h>
-#include <net/ip.h>
-#include <linux/smb_fs.h>
-#include <linux/smb.h>
-#include <linux/smbno.h>
-#include <asm/uaccess.h>
-#include <asm/ioctls.h>
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-static int
-_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
-{
-        struct kvec iov = {ubuf, size};
-        struct msghdr msg = {.msg_flags = flags};
-        msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
-        return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
-}
-/*
- * Return the server this socket belongs to
- */
-static struct smb_sb_info *
-server_from_socket(struct socket *socket)
-{
-        return socket->sk->sk_user_data;
-}
-/*
- * Called when there is data on the socket.
- */
-void
-smb_data_ready(struct sock *sk, int len)
-{
-        struct smb_sb_info *server = server_from_socket(sk->sk_socket);
-        void (*data_ready)(struct sock *, int) = server->data_ready;
-        data_ready(sk, len);
-        VERBOSE("(%p, %d)\n", sk, len);
-        smbiod_wake_up();
-}
-int
-smb_valid_socket(struct inode * inode)
-{
-        return (inode && S_ISSOCK(inode->i_mode) && 
-                SOCKET_I(inode)->type == SOCK_STREAM);
-}
-static struct socket *
-server_sock(struct smb_sb_info *server)
-{
-        struct file *file;
-        if (server && (file = server->sock_file))
-        {
-#ifdef SMBFS_PARANOIA
-                if (!smb_valid_socket(file->f_path.dentry->d_inode))
-                        PARANOIA("bad socket!\n");
-#endif
-                return SOCKET_I(file->f_path.dentry->d_inode);
-        }
-        return NULL;
-}
-void
-smb_close_socket(struct smb_sb_info *server)
-{
-        struct file * file = server->sock_file;
-        if (file) {
-                struct socket *sock = server_sock(server);
-                VERBOSE("closing socket %p\n", sock);
-                sock->sk->sk_data_ready = server->data_ready;
-                server->sock_file = NULL;
-                fput(file);
-        }
-}
-static int
-smb_get_length(struct socket *socket, unsigned char *header)
-{
-        int result;
-        result = _recvfrom(socket, header, 4, MSG_PEEK);
-        if (result == -EAGAIN)
-                return -ENODATA;
-        if (result < 0) {
-                PARANOIA("recv error = %d\n", -result);
-                return result;
-        }
-        if (result < 4)
-                return -ENODATA;
-        switch (header[0]) {
-        case 0x00:
-        case 0x82:
-                break;
-        case 0x85:
-                DEBUG1("Got SESSION KEEP ALIVE\n");
-                _recvfrom(socket, header, 4, 0);        /* read away */
-                return -ENODATA;
-        default:
-                PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
-                return -EIO;
-        }
-        /* The length in the RFC NB header is the raw data length */
-        return smb_len(header);
-}
-int
-smb_recv_available(struct smb_sb_info *server)
-{
-        mm_segment_t oldfs;
-        int avail, err;
-        struct socket *sock = server_sock(server);
-        oldfs = get_fs();
-        set_fs(get_ds());
-        err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
-        set_fs(oldfs);
-        return (err >= 0) ? avail : err;
-}
-/*
- * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
- */
-static int
-smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
-{
-        struct kvec *iv = *data;
-        int i;
-        int len;
-        /*
-         *      Eat any sent kvecs
-         */
-        while (iv->iov_len <= amount) {
-                amount -= iv->iov_len;
-                iv++;
-                (*num)--;
-        }
-        /*
-         *      And chew down the partial one
-         */
-        vec[0].iov_len = iv->iov_len-amount;
-        vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
-        iv++;
-        len = vec[0].iov_len;
-        /*
-         *      And copy any others
-         */
-        for (i = 1; i < *num; i++) {
-                vec[i] = *iv++;
-                len += vec[i].iov_len;
-        }
-        *data = vec;
-        return len;
-}
-/*
- * smb_receive_header
- * Only called by the smbiod thread.
- */
-int
-smb_receive_header(struct smb_sb_info *server)
-{
-        struct socket *sock;
-        int result = 0;
-        unsigned char peek_buf[4];
-        result = -EIO; 
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        if (!server->smb_read) {
-                result = smb_get_length(sock, peek_buf);
-                if (result < 0) {
-                        if (result == -ENODATA)
-                                result = 0;
-                        goto out;
-                }
-                server->smb_len = result + 4;
-                if (server->smb_len < SMB_HEADER_LEN) {
-                        PARANOIA("short packet: %d\n", result);
-                        server->rstate = SMB_RECV_DROP;
-                        result = -EIO;
-                        goto out;
-                }
-                if (server->smb_len > SMB_MAX_PACKET_SIZE) {
-                        PARANOIA("long packet: %d\n", result);
-                        server->rstate = SMB_RECV_DROP;
-                        result = -EIO;
-                        goto out;
-                }
-        }
-        result = _recvfrom(sock, server->header + server->smb_read,
-                           SMB_HEADER_LEN - server->smb_read, 0);
-        VERBOSE("_recvfrom: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        server->smb_read += result;
-        if (server->smb_read == SMB_HEADER_LEN)
-                server->rstate = SMB_RECV_HCOMPLETE;
-out:
-        return result;
-}
-static char drop_buffer[PAGE_SIZE];
-/*
- * smb_receive_drop - read and throw away the data
- * Only called by the smbiod thread.
- *
- * FIXME: we are in the kernel, could we just tell the socket that we want
- * to drop stuff from the buffer?
- */
-int
-smb_receive_drop(struct smb_sb_info *server)
-{
-        struct socket *sock;
-        unsigned int flags;
-        struct kvec iov;
-        struct msghdr msg;
-        int rlen = smb_len(server->header) - server->smb_read + 4;
-        int result = -EIO;
-        if (rlen > PAGE_SIZE)
-                rlen = PAGE_SIZE;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-        iov.iov_base = drop_buffer;
-        iov.iov_len = PAGE_SIZE;
-        msg.msg_flags = flags;
-        msg.msg_name = NULL;
-        msg.msg_namelen = 0;
-        msg.msg_control = NULL;
-        result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
-        VERBOSE("read: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        server->smb_read += result;
-        if (server->smb_read >= server->smb_len)
-                server->rstate = SMB_RECV_END;
-out:
-        return result;
-}
-/*
- * smb_receive
- * Only called by the smbiod thread.
- */
-int
-smb_receive(struct smb_sb_info *server, struct smb_request *req)
-{
-        struct socket *sock;
-        unsigned int flags;
-        struct kvec iov[4];
-        struct kvec *p = req->rq_iov;
-        size_t num = req->rq_iovlen;
-        struct msghdr msg;
-        int rlen;
-        int result = -EIO;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-        msg.msg_flags = flags;
-        msg.msg_name = NULL;
-        msg.msg_namelen = 0;
-        msg.msg_control = NULL;
-        /* Dont repeat bytes and count available bufferspace */
-        rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
-                        (req->rq_rlen - req->rq_bytes_recvd));
-        result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
-        VERBOSE("read: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        req->rq_bytes_recvd += result;
-        server->smb_read += result;
-out:
-        return result;
-}
-/*
- * Try to send a SMB request. This may return after sending only parts of the
- * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
- *
- * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
- */
-int
-smb_send_request(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        struct socket *sock;
-        struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
-        int slen = req->rq_slen - req->rq_bytes_sent;
-        int result = -EIO;
-        struct kvec iov[4];
-        struct kvec *p = req->rq_iov;
-        size_t num = req->rq_iovlen;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        /* Dont repeat bytes */
-        if (req->rq_bytes_sent)
-                smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
-        result = kernel_sendmsg(sock, &msg, p, num, slen);
-        if (result >= 0) {
-                req->rq_bytes_sent += result;
-                if (req->rq_bytes_sent >= req->rq_slen)
-                        req->rq_flags |= SMB_REQ_TRANSMITTED;
-        }
-out:
-        return result;
-}
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  symlink.c
- *
- *  Copyright (C) 2002 by John Newbigin
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-#include "smb_debug.h"
-#include "proto.h"
-int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
-{
-        DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
-        return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
-}
-static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        char *link = __getname();
-        DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
-        if (!link) {
-                link = ERR_PTR(-ENOMEM);
-        } else {
-                int len = smb_proc_read_link(server_from_dentry(dentry),
-                                                dentry, link, PATH_MAX - 1);
-                if (len < 0) {
-                        __putname(link);
-                        link = ERR_PTR(len);
-                } else {
-                        link[len] = 0;
-                }
-        }
-        nd_set_link(nd, link);
-        return NULL;
-}
-static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-        char *s = nd_get_link(nd);
-        if (!IS_ERR(s))
-                __putname(s);
-}
-const struct inode_operations smb_link_inode_operations =
-{
-        .readlink       = generic_readlink,
-        .follow_link    = smb_follow_link,
-        .put_link       = smb_put_link,
-};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 07a4f1156048..24de30ba34c1 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -370,12 +370,10 @@ static void squashfs_put_super(struct super_block *sb)
 }
-static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
-                                const char *dev_name, void *data,
+                                const char *dev_name, void *data)
-                                struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
-                                mnt);
 }
@@ -451,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
 static struct file_system_type squashfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "squashfs",
-        .get_sb = squashfs_get_sb,
+        .mount = squashfs_mount,
        .kill_sb = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV
 };
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
                                        strncmp(target, name, name_size) == 0) {
                        /* found xattr */
                        if (type & SQUASHFS_XATTR_VALUE_OOL) {
-                                __le64 xattr;
+                                __le64 xattr_val;
+                                u64 xattr;
                                /* val is a reference to the real location */
                                err = squashfs_read_metadata(sb, &val, &start,
                                                &offset, sizeof(val));
                                if (err < 0)
                                        goto failed;
-                                err = squashfs_read_metadata(sb, &xattr, &start,
+                                err = squashfs_read_metadata(sb, &xattr_val,
-                                         &offset, sizeof(xattr));
+                                        &start, &offset, sizeof(xattr_val));
                                if (err < 0)
                                        goto failed;
-                                xattr = le64_to_cpu(xattr);
+                                xattr = le64_to_cpu(xattr_val);
                                start = SQUASHFS_XATTR_BLK(xattr) +
                                                        msblk->xattr_table;
                                offset = SQUASHFS_XATTR_OFFSET(xattr);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
                u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
-                int *, unsigned long long *);
+                unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
                u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 }
 static inline int squashfs_xattr_lookup(struct super_block *sb,
-                unsigned int index, int *count, int *size,
+                unsigned int index, int *count, unsigned int *size,
                unsigned long long *xattr)
 {
        return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..d33be5dd6c32 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 /*
 * Map xattr id using the xattr id look up table
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..ca696155cd9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
                get_fs_excl();
                sb->s_flags &= ~MS_ACTIVE;
-                /* bad name - it should be evict_inodes() */
+                fsnotify_unmount_inodes(&sb->s_inodes);
-                invalidate_inodes(sb);
+                evict_inodes(sb);
                if (sop->put_super)
                        sop->put_super(sb);
-                /* Forget any remaining inodes */
+                if (!list_empty(&sb->s_inodes)) {
-                if (invalidate_inodes(sb)) {
                        printk("VFS: Busy inodes after unmount of %s. "
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
@@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
        return set_anon_super(sb, NULL);
 }
-int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
-        int (*fill_super)(struct super_block *, void *, int),
+        void *data, int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct super_block *sb;
        sb = sget(fs_type, ns_test_super, ns_set_super, data);
        if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
        if (!sb->s_root) {
                int err;
@@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
                err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (err) {
                        deactivate_locked_super(sb);
-                        return err;
+                        return ERR_PTR(err);
                }
                sb->s_flags |= MS_ACTIVE;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 }
-EXPORT_SYMBOL(get_sb_ns);
+EXPORT_SYMBOL(mount_ns);
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
@@ -762,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data)
        return (void *)s->s_bdev == data;
 }
-int get_sb_bdev(struct file_system_type *fs_type,
+struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct block_device *bdev;
        struct super_block *s;
@@ -777,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
-                return PTR_ERR(bdev);
+                return ERR_CAST(bdev);
        /*
         * once the super is inserted into the list by sget, s_umount
@@ -829,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
                bdev->bd_super = s;
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 error_s:
        error = PTR_ERR(s);
 error_bdev:
        close_bdev_exclusive(bdev, mode);
 error:
-        return error;
+        return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+int get_sb_bdev(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
+        return 0;
 }
 EXPORT_SYMBOL(get_sb_bdev);
@@ -856,29 +868,42 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
 #endif
-int get_sb_nodev(struct file_system_type *fs_type,
+struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        int error;
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        s->s_flags = flags;
        error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
        if (error) {
                deactivate_locked_super(s);
-                return error;
+                return ERR_PTR(error);
        }
        s->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 }
+EXPORT_SYMBOL(mount_nodev);
+int get_sb_nodev(struct file_system_type *fs_type,
+        int flags, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_nodev(fs_type, flags, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
+        return 0;
+}
 EXPORT_SYMBOL(get_sb_nodev);
 static int compare_single(struct super_block *s, void *p)
@@ -886,29 +911,42 @@ static int compare_single(struct super_block *s, void *p)
        return 1;
 }
-int get_sb_single(struct file_system_type *fs_type,
+struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct super_block *s;
        int error;
        s = sget(fs_type, compare_single, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        if (!s->s_root) {
                s->s_flags = flags;
                error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(s);
-                        return error;
+                        return ERR_PTR(error);
                }
                s->s_flags |= MS_ACTIVE;
        } else {
                do_remount_sb(s, flags, data, 0);
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+int get_sb_single(struct file_system_type *fs_type,
+        int flags, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_single(fs_type, flags, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
        return 0;
 }
@@ -918,6 +956,7 @@ struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
        struct vfsmount *mnt;
+        struct dentry *root;
        char *secdata = NULL;
        int error;
@@ -942,9 +981,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
                        goto out_free_secdata;
        }
-        error = type->get_sb(type, flags, name, data, mnt);
+        if (type->mount) {
-        if (error < 0)
+                root = type->mount(type, flags, name, data);
-                goto out_free_secdata;
+                if (IS_ERR(root)) {
+                        error = PTR_ERR(root);
+                        goto out_free_secdata;
+                }
+                mnt->mnt_root = root;
+                mnt->mnt_sb = root->d_sb;
+        } else {
+                error = type->get_sb(type, flags, name, data, mnt);
+                if (error < 0)
+                        goto out_free_secdata;
+        }
        BUG_ON(!mnt->mnt_sb);
        WARN_ON(!mnt->mnt_sb->s_bdi);
        mnt->mnt_sb->s_flags |= MS_BORN;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
 #include "sysfs.h"
-static struct vfsmount *sysfs_mount;
+static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
        return error;
 }
-static int sysfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *sysfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        struct sysfs_super_info *info;
        enum kobj_ns_type type;
        struct super_block *sb;
        int error;
-        error = -ENOMEM;
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info)
-                goto out;
+                return ERR_PTR(-ENOMEM);
        for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
                info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
        sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
        if (IS_ERR(sb) || sb->s_fs_info != info)
                kfree(info);
-        if (IS_ERR(sb)) {
+        if (IS_ERR(sb))
-                error = PTR_ERR(sb);
+                return ERR_CAST(sb);
-                goto out;
-        }
        if (!sb->s_root) {
                sb->s_flags = flags;
                error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(sb);
-                        goto out;
+                        return ERR_PTR(error);
                }
                sb->s_flags |= MS_ACTIVE;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        error = 0;
-out:
-        return error;
 }
 static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
 static struct file_system_type sysfs_fs_type = {
        .name           = "sysfs",
-        .get_sb         = sysfs_get_sb,
+        .mount          = sysfs_mount,
        .kill_sb        = sysfs_kill_sb,
 };
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
        err = register_filesystem(&sysfs_fs_type);
        if (!err) {
-                sysfs_mount = kern_mount(&sysfs_fs_type);
+                sysfs_mnt = kern_mount(&sysfs_fs_type);
-                if (IS_ERR(sysfs_mount)) {
+                if (IS_ERR(sysfs_mnt)) {
                        printk(KERN_ERR "sysfs: could not mount!\n");
-                        err = PTR_ERR(sysfs_mount);
+                        err = PTR_ERR(sysfs_mnt);
-                        sysfs_mount = NULL;
+                        sysfs_mnt = NULL;
                        unregister_filesystem(&sysfs_fs_type);
                        goto out_err;
                }
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..3d9c62be0c10 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -526,23 +526,22 @@ failed:
 /* Every kernel module contains stuff like this. */
-static int sysv_get_sb(struct file_system_type *fs_type,
+static struct dentry *sysv_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
-                           mnt);
 }
-static int v7_get_sb(struct file_system_type *fs_type,
+static struct dentry *v7_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
 }
 static struct file_system_type sysv_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "sysv",
-        .get_sb         = sysv_get_sb,
+        .mount          = sysv_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
 static struct file_system_type v7_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "v7",
-        .get_sb         = v7_get_sb,
+        .mount          = v7_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        lock_2_inodes(dir, inode);
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = ubifs_current_time(inode);
        dir->i_size += sz_change;
        dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9a47c9f0ad07..91fac54c70e3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2038,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
        return c->vi.cdev == *dev;
 }
-static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
-                        const char *name, void *data, struct vfsmount *mnt)
+                        const char *name, void *data)
 {
        struct ubi_volume_desc *ubi;
        struct ubi_volume_info vi;
@@ -2057,7 +2057,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
        if (IS_ERR(ubi)) {
                dbg_err("cannot open \"%s\", error %d",
                        name, (int)PTR_ERR(ubi));
-                return PTR_ERR(ubi);
+                return ERR_CAST(ubi);
        }
        ubi_get_volume_info(ubi, &vi);
@@ -2095,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
        /* 'fill_super()' opens ubi again so we must close it here */
        ubi_close_volume(ubi);
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 out_deact:
        deactivate_locked_super(sb);
 out_close:
        ubi_close_volume(ubi);
-        return err;
+        return ERR_PTR(err);
 }
 static struct file_system_type ubifs_fs_type = {
        .name    = "ubifs",
        .owner   = THIS_MODULE,
-        .get_sb  = ubifs_get_sb,
+        .mount   = ubifs_mount,
        .kill_sb = kill_anon_super,
 };
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = current_fs_time(inode->i_sb);
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        unlock_kernel();
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 76f3d6d97b40..4a5c7c61836a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 }
 /* UDF filesystem type */
-static int udf_get_sb(struct file_system_type *fs_type,
+static struct dentry *udf_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name, void *data,
+                      int flags, const char *dev_name, void *data)
-                      struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
 }
 static struct file_system_type udf_fstype = {
        .owner          = THIS_MODULE,
        .name           = "udf",
-        .get_sb         = udf_get_sb,
+        .mount          = udf_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        error = ufs_add_nondir(dentry, inode);
        unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6b9be90dae7d..2c47daed56da 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1454,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
        .show_options   = ufs_show_options,
 };
-static int ufs_get_sb(struct file_system_type *fs_type,
+static struct dentry *ufs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
 }
 static struct file_system_type ufs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ufs",
-        .get_sb         = ufs_get_sb,
+        .mount          = ufs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
 config XFS_QUOTA
        bool "XFS Quota support"
        depends on XFS_FS
+        select QUOTACTL
        help
          If you say Y here, you will be able to set limits for disk usage on
          a per user and/or a per group basis under XFS.  XFS considers quota
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..c9af48fffcd7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1139,8 +1139,7 @@ xfs_vm_writepage(
                                type = IO_DELAY;
                                flags = BMAPI_ALLOCATE;
-                                if (wbc->sync_mode == WB_SYNC_NONE &&
+                                if (wbc->sync_mode == WB_SYNC_NONE)
-                                    wbc->nonblocking)
                                        flags |= BMAPI_TRYLOCK;
                        }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba5312802aa9..63fd2c07cb57 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1580,6 +1580,7 @@ xfs_mapping_buftarg(
                        XFS_BUFTARG_NAME(btp));
                return ENOMEM;
        }
+        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFBLK;
        inode->i_bdev = bdev;
        inode->i_rdev = bdev->bd_dev;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ec858e09d546..96107efc0c61 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -317,7 +317,7 @@ xfs_vn_link(
        if (unlikely(error))
                return -error;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        return 0;
 }
@@ -760,7 +760,9 @@ xfs_setup_inode(
        inode->i_ino = ip->i_ino;
        inode->i_state = I_NEW;
-        inode_add_to_lists(ip->i_mount->m_super, inode);
+        inode_sb_list_add(inode);
+        insert_inode_hash(inode);
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index ab31ce5aeaf9..9f3a78fe6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -576,7 +576,7 @@ xfs_max_file_offset(
        /* Figure out maximum filesize, on Linux this can depend on
         * the filesystem blocksize (on 32 bit platforms).
-         * __block_prepare_write does this in an [unsigned] long...
+         * __block_write_begin does this in an [unsigned] long...
         *      page->index << (PAGE_CACHE_SHIFT - bbits)
         * So, for page sized blocks (4K on 32 bit platforms),
         * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -1609,16 +1609,14 @@ xfs_fs_fill_super(
        goto out_free_sb;
 }
-STATIC int
+STATIC struct dentry *
-xfs_fs_get_sb(
+xfs_fs_mount(
        struct file_system_type *fs_type,
        int                     flags,
        const char              *dev_name,
-        void                    *data,
+        void                    *data)
-        struct vfsmount         *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
-                           mnt);
 }
 static const struct super_operations xfs_super_operations = {
@@ -1639,7 +1637,7 @@ static const struct super_operations xfs_super_operations = {
 static struct file_system_type xfs_fs_type = {
        .owner                  = THIS_MODULE,
        .name                   = "xfs",
-        .get_sb                 = xfs_fs_get_sb,
+        .mount                  = xfs_fs_mount,
        .kill_sb                = kill_block_super,
        .fs_flags               = FS_REQUIRES_DEV,
 };
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fac52290de90..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-        atomic_inc(&(VFS_I(ip)->i_count)); \
+        ihold(VFS_I(ip)); \
        trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)