Merge branch 'x86/amd-nb' into x86/apic-cleanups

Reason: apic cleanup series depends on x86/apic, x86/amd-nb x86/platform Conflicts: arch/x86/include/asm/io_apic.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
author: Thomas Gleixner <tglx@linutronix.de> 2010-12-09 12:17:25 -0500
committer: Thomas Gleixner <tglx@linutronix.de> 2010-12-09 12:17:25 -0500
commit: d834a9dcecae834cd6b2bc5e50e1907738d9cf6a (patch)
tree: 0589d753465d3fe359ba451ba6cb7798df03aaa2 /fs
parent: a38c5380ef9f088be9f49b6e4c5d80af8b1b5cd4 (diff)
parent: f658bcfb2607bf0808966a69cf74135ce98e5c2d (diff)
550 files changed, 19631 insertions, 20310 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 795233702a4e..7e0511476797 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -17,3 +17,16 @@ config 9P_FSCACHE
          Choose Y here to enable persistent, read-only local
          caching support for 9p clients using FS-Cache
+config 9P_FS_POSIX_ACL
+        bool "9P POSIX Access Control Lists"
+        depends on 9P_FS
+        select FS_POSIX_ACL
+        help
+          POSIX Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          To learn more about Access Control Lists, visit the POSIX ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          If you don't know what Access Control Lists are, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 91fba025fcbe..f8ba37effd1b 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_9P_FS) := 9p.o
        xattr_user.o
 9p-$(CONFIG_9P_FSCACHE) += cache.o
+9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
new file mode 100644
index 000000000000..12d602351dbe
--- /dev/null
+++ b/fs/9p/acl.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/posix_acl_xattr.h>
+#include "xattr.h"
+#include "acl.h"
+#include "v9fs_vfs.h"
+#include "v9fs.h"
+/*
+ * Read the xattr @name through @fid and parse it as a POSIX ACL.
+ *
+ * Returns the parsed ACL (the caller owns the reference), NULL when the
+ * attribute is absent, empty or unsupported by the server, or an ERR_PTR()
+ * on allocation, transport or parse failure.
+ */
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+{
+        ssize_t size;
+        void *value = NULL;
+        struct posix_acl *acl = NULL;
+        /* A NULL buffer only asks the server for the attribute size. */
+        size = v9fs_fid_xattr_get(fid, name, NULL, 0);
+        if (size > 0) {
+                value = kzalloc(size, GFP_NOFS);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                size = v9fs_fid_xattr_get(fid, name, value, size);
+        }
+        /*
+         * Classify the outcome of whichever read ran last, so that a
+         * failure of the second read is not mistaken for "no ACL".
+         */
+        if (size > 0) {
+                /* May be an ERR_PTR(); it is passed on to the caller as is. */
+                acl = posix_acl_from_xattr(value, size);
+        } else if (size == -ENODATA || size == 0 ||
+                   size == -ENOSYS || size == -EOPNOTSUPP) {
+                acl = NULL;
+        } else
+                acl = ERR_PTR(-EIO);
+        kfree(value);
+        return acl;
+}
+/*
+ * Populate the ACL cache of @inode.
+ *
+ * Unless the session is mounted with access=client, both cache slots are
+ * set to NULL.  Otherwise the default and access ACLs are fetched through
+ * @fid and cached.  Returns 0 on success or -EIO if either fetch failed.
+ */
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+        int retval = 0;
+        struct posix_acl *pacl, *dacl;
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(inode);
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+                set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
+                set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+                return 0;
+        }
+        /* get the default/access acl values and cache them */
+        dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
+        pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+        if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
+                set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+                set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
+        } else
+                retval = -EIO;
+        /*
+         * Drop our references on every path: when only one lookup failed
+         * the other one still returned an ACL that must not be leaked.
+         */
+        if (!IS_ERR(dacl))
+                posix_acl_release(dacl);
+        if (!IS_ERR(pacl))
+                posix_acl_release(pacl);
+        return retval;
+}
+/*
+ * Return the ACL of @type cached on @inode.
+ *
+ * 9p always caches the ACL values when the inode is instantiated
+ * (v9fs_inode_from_fid), so finding ACL_NOT_CACHED here is treated as a bug.
+ */
+static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
+{
+        struct posix_acl *cached = get_cached_acl(inode, type);
+        BUG_ON(cached == ACL_NOT_CACHED);
+        return cached;
+}
+/*
+ * Check @mask against the cached access ACL of @inode.
+ *
+ * Returns 0 without consulting any ACL unless the session is mounted with
+ * access=client, -EAGAIN when no access ACL is cached, and otherwise the
+ * result of posix_acl_permission().
+ */
+int v9fs_check_acl(struct inode *inode, int mask)
+{
+        int rc;
+        struct posix_acl *access_acl;
+        struct v9fs_session_info *session = v9fs_inode2v9ses(inode);
+        /* ACLs are only evaluated locally in access=client mode. */
+        if ((session->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return 0;
+        access_acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(access_acl))
+                return PTR_ERR(access_acl);
+        if (!access_acl)
+                return -EAGAIN;
+        rc = posix_acl_permission(inode, access_acl, mask);
+        posix_acl_release(access_acl);
+        return rc;
+}
+/*
+ * Cache @acl as the ACL of @type on the dentry's inode, then store its
+ * xattr encoding on the server.  Returns 0 or a negative errno; the cache
+ * is updated before the server is contacted.
+ */
+static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+{
+        int rc;
+        char *xattr_name;
+        void *xattr_buf;
+        size_t xattr_len;
+        struct inode *inode = dentry->d_inode;
+        set_cached_acl(inode, type, acl);
+        /* Encode the ACL and send a setxattr request to the server */
+        xattr_len = posix_acl_xattr_size(acl->a_count);
+        xattr_buf = kmalloc(xattr_len, GFP_KERNEL);
+        if (!xattr_buf)
+                return -ENOMEM;
+        rc = posix_acl_to_xattr(acl, xattr_buf, xattr_len);
+        if (rc >= 0) {
+                if (type == ACL_TYPE_ACCESS)
+                        xattr_name = POSIX_ACL_XATTR_ACCESS;
+                else if (type == ACL_TYPE_DEFAULT)
+                        xattr_name = POSIX_ACL_XATTR_DEFAULT;
+                else
+                        BUG();
+                rc = v9fs_xattr_set(dentry, xattr_name,
+                                    xattr_buf, xattr_len, 0);
+        }
+        kfree(xattr_buf);
+        return rc;
+}
+/*
+ * Re-derive the access ACL of @dentry from the inode's current mode bits
+ * and store the result.  Symlinks are rejected with -EOPNOTSUPP; an inode
+ * without a cached access ACL is left alone and 0 is returned.
+ */
+int v9fs_acl_chmod(struct dentry *dentry)
+{
+        int rc;
+        struct posix_acl *cached, *copy;
+        struct inode *inode = dentry->d_inode;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        cached = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (!cached)
+                return 0;
+        /* Work on a private copy; the cached reference is dropped at once. */
+        copy = posix_acl_clone(cached, GFP_KERNEL);
+        posix_acl_release(cached);
+        if (!copy)
+                return -ENOMEM;
+        rc = posix_acl_chmod_masq(copy, inode->i_mode);
+        if (!rc)
+                rc = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, copy);
+        posix_acl_release(copy);
+        return rc;
+}
+/*
+ * Apply the ACLs computed for a newly created object to @dentry and drop
+ * the references on @dpacl and @pacl.  Either pointer may be NULL.
+ * Always returns 0; the results of the individual stores are not reported.
+ */
+int v9fs_set_create_acl(struct dentry *dentry,
+                        struct posix_acl *dpacl, struct posix_acl *pacl)
+{
+        if (dpacl != NULL)
+                v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+        if (pacl != NULL)
+                v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+        /* Both references are consumed whether or not the stores worked. */
+        posix_acl_release(dpacl);
+        posix_acl_release(pacl);
+        return 0;
+}
+/*
+ * Compute the creation mode and ACLs for a new object in @dir.
+ *
+ * @modep: in/out, the requested mode; updated on success.
+ * @dpacl: set to the parent's default ACL when a directory is being created.
+ * @pacl:  set to the access ACL for the new object when one is required.
+ *
+ * Without a default ACL on @dir the umask is applied to the mode (except
+ * for symlinks).  References stored in @dpacl and @pacl belong to the
+ * caller; both are left untouched on failure.  Returns 0 or a negative
+ * errno.
+ */
+int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                  struct posix_acl **dpacl, struct posix_acl **pacl)
+{
+        int retval = 0;
+        mode_t mode = *modep;
+        struct posix_acl *acl = NULL;
+        if (!S_ISLNK(mode)) {
+                acl = v9fs_get_cached_acl(dir, ACL_TYPE_DEFAULT);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                if (!acl)
+                        mode &= ~current_umask();
+        }
+        if (acl) {
+                struct posix_acl *clone;
+                clone = posix_acl_clone(acl, GFP_NOFS);
+                if (!clone) {
+                        posix_acl_release(acl);
+                        return -ENOMEM;
+                }
+                retval = posix_acl_create_masq(clone, &mode);
+                if (retval < 0) {
+                        posix_acl_release(clone);
+                        posix_acl_release(acl);
+                        return retval;
+                }
+                /*
+                 * A positive result means the access ACL has to be stored;
+                 * zero means the mode bits alone are enough, so the clone
+                 * is not needed.
+                 */
+                if (retval > 0)
+                        *pacl = clone;
+                else
+                        posix_acl_release(clone);
+                /*
+                 * Only directories inherit the default ACL.  Hand our
+                 * reference to the caller in that case, otherwise drop it.
+                 */
+                if (S_ISDIR(mode))
+                        *dpacl = acl;
+                else
+                        posix_acl_release(acl);
+        }
+        *modep  = mode;
+        return 0;
+}
+/*
+ * Read the ACL xattr selected by @type straight from the server.
+ * @name is not used; the full attribute name is derived from @type.
+ */
+static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
+                               void *buffer, size_t size, int type)
+{
+        char *xattr_name;
+        if (type == ACL_TYPE_ACCESS)
+                xattr_name = POSIX_ACL_XATTR_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                xattr_name = POSIX_ACL_XATTR_DEFAULT;
+        else
+                BUG();
+        return v9fs_xattr_get(dentry, xattr_name, buffer, size);
+}
+/*
+ * xattr "get" handler for the POSIX ACL attributes.
+ *
+ * @name must be the empty string, otherwise -EINVAL is returned.  Unless
+ * the session uses access=client the request is forwarded to the server;
+ * in access=client mode the cached ACL is encoded into @buffer, or
+ * -ENODATA is returned when none is cached.
+ */
+static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
+                              void *buffer, size_t size, int type)
+{
+        int rc;
+        struct posix_acl *cached;
+        struct v9fs_session_info *session;
+        if (strcmp(name, ""))
+                return -EINVAL;
+        session = v9fs_inode2v9ses(dentry->d_inode);
+        /*
+         * We allow set/get/list of acl when access=client is not specified
+         */
+        if ((session->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+        cached = v9fs_get_cached_acl(dentry->d_inode, type);
+        if (IS_ERR(cached))
+                return PTR_ERR(cached);
+        if (!cached)
+                return -ENODATA;
+        rc = posix_acl_to_xattr(cached, buffer, size);
+        posix_acl_release(cached);
+        return rc;
+}
+/*
+ * Write the ACL xattr selected by @type straight to the server.
+ * @name is not used; the full attribute name is derived from @type.
+ */
+static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
+                              const void *value, size_t size,
+                              int flags, int type)
+{
+        char *xattr_name;
+        if (type == ACL_TYPE_ACCESS)
+                xattr_name = POSIX_ACL_XATTR_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                xattr_name = POSIX_ACL_XATTR_DEFAULT;
+        else
+                BUG();
+        return v9fs_xattr_set(dentry, xattr_name, value, size, flags);
+}
+/*
+ * xattr "set" handler for the POSIX ACL attributes.
+ *
+ * @name must be the empty string, otherwise -EINVAL is returned.  Unless
+ * the session uses access=client the value is passed to the server
+ * unchecked.  In access=client mode the value is parsed and validated, the
+ * mode bits are updated for an access ACL, the attribute is written to the
+ * server and, on success, the ACL cache is updated.  A NULL @value clears
+ * the ACL.  Returns 0 or a negative errno.
+ */
+static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
+                              const void *value, size_t size,
+                              int flags, int type)
+{
+        int retval;
+        struct posix_acl *acl;
+        struct v9fs_session_info *v9ses;
+        struct inode *inode = dentry->d_inode;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        /*
+         * set the attribute on the remote. Without even looking at the
+         * xattr value. We leave it to the server to validate
+         */
+        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+                return v9fs_remote_set_acl(dentry, name,
+                                           value, size, flags, type);
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!is_owner_or_cap(inode))
+                return -EPERM;
+        if (value) {
+                /* update the cached acl value */
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                else if (acl) {
+                        retval = posix_acl_valid(acl);
+                        if (retval)
+                                goto err_out;
+                }
+        } else
+                acl = NULL;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name = POSIX_ACL_XATTR_ACCESS;
+                if (acl) {
+                        struct iattr iattr;
+                        mode_t mode = inode->i_mode;
+                        retval = posix_acl_equiv_mode(acl, &mode);
+                        if (retval < 0)
+                                goto err_out;
+                        if (retval == 0) {
+                                /*
+                                 * ACL can be represented
+                                 * by the mode bits. So don't
+                                 * update ACL.  Drop the parsed ACL
+                                 * here; clearing the pointer alone
+                                 * would leak its reference.
+                                 */
+                                posix_acl_release(acl);
+                                acl = NULL;
+                                value = NULL;
+                                size = 0;
+                        }
+                        /* Update the mode bits */
+                        iattr.ia_mode = ((mode & S_IALLUGO) |
+                                         (inode->i_mode & ~S_IALLUGO));
+                        iattr.ia_valid = ATTR_MODE;
+                        /* FIXME should we update ctime ?
+                         * What is the following setxattr update the
+                         * mode ?
+                         */
+                        v9fs_vfs_setattr_dotl(dentry, &iattr);
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                name = POSIX_ACL_XATTR_DEFAULT;
+                if (!S_ISDIR(inode->i_mode)) {
+                        retval = -EINVAL;
+                        goto err_out;
+                }
+                break;
+        default:
+                BUG();
+        }
+        retval = v9fs_xattr_set(dentry, name, value, size, flags);
+        if (!retval)
+                set_cached_acl(inode, type, acl);
+err_out:
+        posix_acl_release(acl);
+        return retval;
+}
+/* xattr handler for the access ACL attribute (POSIX_ACL_XATTR_ACCESS). */
+const struct xattr_handler v9fs_xattr_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags = ACL_TYPE_ACCESS,       /* passed to get/set as "type" */
+        .get = v9fs_xattr_get_acl,
+        .set = v9fs_xattr_set_acl,
+};
+/* xattr handler for the default ACL attribute (POSIX_ACL_XATTR_DEFAULT). */
+const struct xattr_handler v9fs_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags = ACL_TYPE_DEFAULT,      /* passed to get/set as "type" */
+        .get = v9fs_xattr_get_acl,
+        .set = v9fs_xattr_set_acl,
+};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
new file mode 100644
index 000000000000..59e18c2e8c7e
--- /dev/null
+++ b/fs/9p/acl.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_ACL_H
+#define FS_9P_ACL_H
+#ifdef CONFIG_9P_FS_POSIX_ACL
+/* ACL support built in: the implementations live in acl.c. */
+extern int v9fs_get_acl(struct inode *, struct p9_fid *);
+extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_acl_chmod(struct dentry *);
+extern int v9fs_set_create_acl(struct dentry *,
+                               struct posix_acl *, struct posix_acl *);
+extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                         struct posix_acl **dpacl, struct posix_acl **pacl);
+#else
+/*
+ * ACL support compiled out: v9fs_check_acl becomes NULL and every helper
+ * is a no-op that returns 0 without touching its arguments.
+ */
+#define v9fs_check_acl NULL
+static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
+{
+        return 0;
+}
+static inline int v9fs_acl_chmod(struct dentry *dentry)
+{
+        return 0;
+}
+static inline int v9fs_set_create_acl(struct dentry *dentry,
+                                      struct posix_acl *dpacl,
+                                      struct posix_acl *pacl)
+{
+        return 0;
+}
+static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+                                struct posix_acl **dpacl,
+                                struct posix_acl **pacl)
+{
+        return 0;
+}
+#endif /* CONFIG_9P_FS_POSIX_ACL */
+#endif /* FS_9P_ACL_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6406f896bf95..b00223c99d70 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -149,6 +149,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
        switch (access) {
        case V9FS_ACCESS_SINGLE:
        case V9FS_ACCESS_USER:
+        case V9FS_ACCESS_CLIENT:
                uid = current_fsuid();
                any = 0;
                break;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 38dc0e067599..2f77cd33ba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -193,7 +193,17 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
                                v9ses->flags |= V9FS_ACCESS_USER;
                        else if (strcmp(s, "any") == 0)
                                v9ses->flags |= V9FS_ACCESS_ANY;
-                        else {
+                        else if (strcmp(s, "client") == 0) {
+#ifdef CONFIG_9P_FS_POSIX_ACL
+                                v9ses->flags |= V9FS_ACCESS_CLIENT;
+#else
+                                P9_DPRINTK(P9_DEBUG_ERROR,
+                                        "access=client option not supported\n");
+                                kfree(s);
+                                ret = -EINVAL;
+                                goto free_and_return;
+#endif
+                        } else {
                                v9ses->flags |= V9FS_ACCESS_SINGLE;
                                v9ses->uid = simple_strtoul(s, &e, 10);
                                if (*e != '\0')
@@ -278,6 +288,16 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
+        if (!v9fs_proto_dotl(v9ses) &&
+            ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+                /*
+                 * We support ACCESS_CLIENT only for dotl.
+                 * Fall back to ACCESS_USER
+                 */
+                v9ses->flags &= ~V9FS_ACCESS_MASK;
+                v9ses->flags |= V9FS_ACCESS_USER;
+        }
+        /*FIXME !! */
        /* for legacy mode, fall back to V9FS_ACCESS_ANY */
        if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
                ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4c963c9fc41f..cb6396855e2d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -33,13 +33,17 @@
 *
 * Session flags reflect options selected by users at mount time
 */
+#define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \
+                         V9FS_ACCESS_USER |   \
+                         V9FS_ACCESS_CLIENT)
+#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
 enum p9_session_flags {
        V9FS_PROTO_2000U        = 0x01,
        V9FS_PROTO_2000L        = 0x02,
        V9FS_ACCESS_SINGLE      = 0x04,
        V9FS_ACCESS_USER        = 0x08,
-        V9FS_ACCESS_ANY         = 0x0C,
+        V9FS_ACCESS_CLIENT      = 0x10
-        V9FS_ACCESS_MASK        = 0x0C,
 };
 /* possible values of ->cache */
@@ -113,8 +117,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
-#define V9FS_MAGIC 0x01021997
 /* other default globals */
 #define V9FS_PORT       564
 #define V9FS_DEFUSER    "nobody"
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 88418c419ea7..bab0eac873f4 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -64,3 +64,7 @@ int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
+int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+#define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 90e38449f4b3..b7f2a8e3863e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -154,10 +154,40 @@ static int v9fs_launder_page(struct page *page)
        return 0;
 }
+/**
+ * v9fs_direct_IO - 9P address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * Having v9fs_direct_IO() in the address space ops vector allows open()
+ * with the O_DIRECT flag, which would otherwise fail.
+ *
+ * In the non-cached mode, direct read and write requests are shunted off
+ * before the VFS gets them, so this method should never be called.
+ *
+ * Direct IO is not 'yet' supported in the cached mode. Hence when this
+ * routine is reached through generic_file_aio_read(), the read/write fails
+ * with -EINVAL.
+ *
+ */
+ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+                loff_t pos, unsigned long nr_segs)
+{
+        struct dentry *target = iocb->ki_filp->f_path.dentry;
+        /* Nothing is transferred: log the attempt and reject it. */
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
+                        "off/no(%lld/%lu) EINVAL\n",
+                        target->d_name.name,
+                        (long long) pos, nr_segs);
+        return -EINVAL;
+}
 const struct address_space_operations v9fs_addr_operations = {
      .readpage = v9fs_vfs_readpage,
      .readpages = v9fs_vfs_readpages,
      .releasepage = v9fs_release_page,
      .invalidatepage = v9fs_invalidate_page,
      .launder_page = v9fs_launder_page,
+      .direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 899f168fd19c..b84ebe8cefed 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -242,7 +242,8 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
                while (rdir->head < rdir->tail) {
                        err = p9dirent_read(rdir->buf + rdir->head,
-                                                buflen - rdir->head, &curdirent,
+                                                rdir->tail - rdir->head,
+                                                &curdirent,
                                                fid->clnt->proto_version);
                        if (err < 0) {
                                P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -314,4 +315,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
        .readdir = v9fs_dir_readdir_dotl,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e97c92bd6f16..240c30674396 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -33,6 +33,7 @@
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <linux/pagemap.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -44,6 +45,7 @@
 #include "cache.h"
 static const struct file_operations v9fs_cached_file_operations;
+static const struct file_operations v9fs_cached_file_operations_dotl;
 /**
 * v9fs_file_open - open a file (or directory)
@@ -92,6 +94,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                /* enable cached file options */
                if(file->f_op == &v9fs_file_operations)
                        file->f_op = &v9fs_cached_file_operations;
+                else if (file->f_op == &v9fs_file_operations_dotl)
+                        file->f_op = &v9fs_cached_file_operations_dotl;
 #ifdef CONFIG_9P_FSCACHE
                v9fs_cache_inode_set_cookie(inode, file);
@@ -130,6 +134,206 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
        return res;
 }
+/*
+ * Take or release a POSIX lock: first locally, then on the server.
+ *
+ * @filp: open file; its private_data holds the fid used for the request
+ * @cmd:  lock command; a SETLKW command makes the request blocking
+ * @fl:   lock description, must have FL_POSIX set
+ *
+ * Returns 0 on success, -EAGAIN if the server still reports the lock as
+ * blocked, -ENOLCK for any other server status, or the transport error.
+ * When the server side fails, a freshly taken local lock is undone.
+ */
+static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+        struct p9_flock flock;
+        struct p9_fid *fid;
+        /* Never switch on an undefined status if the request fails. */
+        uint8_t status = P9_LOCK_ERROR;
+        int res = 0;
+        unsigned char fl_type;
+        fid = filp->private_data;
+        BUG_ON(fid == NULL);
+        if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
+                BUG();
+        res = posix_lock_file_wait(filp, fl);
+        if (res < 0)
+                goto out;
+        /* convert posix lock to p9 tlock args */
+        memset(&flock, 0, sizeof(flock));
+        flock.type = fl->fl_type;
+        flock.start = fl->fl_start;
+        if (fl->fl_end == OFFSET_MAX)
+                flock.length = 0;
+        else
+                flock.length = fl->fl_end - fl->fl_start + 1;
+        flock.proc_id = fl->fl_pid;
+        flock.client_id = utsname()->nodename;
+        if (IS_SETLKW(cmd))
+                flock.flags = P9_LOCK_FLAGS_BLOCK;
+        /*
+         * if its a blocked request and we get P9_LOCK_BLOCKED as the status
+         * for lock request, keep on trying
+         */
+        for (;;) {
+                res = p9_client_lock_dotl(fid, &flock, &status);
+                if (res < 0)
+                        goto out_unlock;
+                if (status != P9_LOCK_BLOCKED || !IS_SETLKW(cmd))
+                        break;
+                /*
+                 * A non-zero return means the sleep was cut short; stop
+                 * retrying instead of hammering the server in a tight loop.
+                 */
+                if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+                        break;
+        }
+        /* map 9p status to VFS status */
+        switch (status) {
+        case P9_LOCK_SUCCESS:
+                res = 0;
+                break;
+        case P9_LOCK_BLOCKED:
+                res = -EAGAIN;
+                break;
+        case P9_LOCK_ERROR:
+        case P9_LOCK_GRACE:
+                res = -ENOLCK;
+                break;
+        default:
+                /* The value comes from the server: fail, do not crash. */
+                P9_DPRINTK(P9_DEBUG_ERROR,
+                                "unknown lock status %d\n", status);
+                res = -ENOLCK;
+                break;
+        }
+out_unlock:
+        /*
+         * in case the server returned an error for the lock request,
+         * revert it locally
+         */
+        if (res < 0 && fl->fl_type != F_UNLCK) {
+                fl_type = fl->fl_type;
+                fl->fl_type = F_UNLCK;
+                /* Even if this fails we want to return the remote error */
+                posix_lock_file_wait(filp, fl);
+                fl->fl_type = fl_type;
+        }
+out:
+        return res;
+}
+/*
+ * Test for a conflicting lock, locally first and then on the server.
+ *
+ * On return @fl describes the conflicting lock, or has fl_type F_UNLCK if
+ * none was found.  Returns a negative errno if the server request failed.
+ */
+static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
+{
+        struct p9_getlock req;
+        struct p9_fid *fid = filp->private_data;
+        int res = 0;
+        BUG_ON(fid == NULL);
+        posix_test_lock(filp, fl);
+        /*
+         * A conflict found locally is reported as is; the server is only
+         * asked when there is none.
+         */
+        if (fl->fl_type != F_UNLCK)
+                return res;
+        /* convert posix lock to p9 tgetlock args */
+        memset(&req, 0, sizeof(req));
+        req.type = fl->fl_type;
+        req.start = fl->fl_start;
+        /* A length of 0 stands for "up to OFFSET_MAX". */
+        if (fl->fl_end != OFFSET_MAX)
+                req.length = fl->fl_end - fl->fl_start + 1;
+        else
+                req.length = 0;
+        req.proc_id = fl->fl_pid;
+        req.client_id = utsname()->nodename;
+        res = p9_client_getlock_dotl(fid, &req);
+        if (res < 0)
+                return res;
+        if (req.type == F_UNLCK) {
+                fl->fl_type = F_UNLCK;
+                return res;
+        }
+        /* Copy the conflicting lock reported by the server back to @fl. */
+        fl->fl_type = req.type;
+        fl->fl_start = req.start;
+        if (req.length != 0)
+                fl->fl_end = req.start + req.length - 1;
+        else
+                fl->fl_end = OFFSET_MAX;
+        fl->fl_pid = req.proc_id;
+        return res;
+}
+/**
+ * v9fs_file_lock_dotl - lock a file (or directory)
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ * Returns -ENOLCK for a lock request on a mandatory-lock inode, -EINVAL
+ * for a command that is neither a set nor a get, and otherwise the result
+ * of the set/get helper.
+ */
+static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        int setting;
+        P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+                                cmd, fl, filp->f_path.dentry->d_name.name);
+        /* No mandatory locks */
+        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+                return -ENOLCK;
+        setting = IS_SETLK(cmd) || IS_SETLKW(cmd);
+        /* Write back and drop cached pages before acquiring a lock. */
+        if (setting && fl->fl_type != F_UNLCK) {
+                filemap_write_and_wait(inode->i_mapping);
+                invalidate_mapping_pages(&inode->i_data, 0, -1);
+        }
+        if (setting)
+                return v9fs_file_do_lock(filp, cmd, fl);
+        if (IS_GETLK(cmd))
+                return v9fs_file_getlock(filp, fl);
+        return -EINVAL;
+}
+/**
+ * v9fs_file_flock_dotl - lock a file
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
+ *
+ * The flock request is rewritten in place as a whole-file POSIX lock owned
+ * by @filp and handed to v9fs_file_do_lock().  Returns -ENOLCK for a lock
+ * request on a mandatory-lock inode or when @fl is not an flock request,
+ * and -EINVAL for a command that is not a set command.
+ */
+static int v9fs_file_flock_dotl(struct file *filp, int cmd,
+        struct file_lock *fl)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        int ret = -ENOLCK;
+        P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
+                                cmd, fl, filp->f_path.dentry->d_name.name);
+        /* No mandatory locks */
+        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+                goto out_err;
+        if (!(fl->fl_flags & FL_FLOCK))
+                goto out_err;
+        if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+                filemap_write_and_wait(inode->i_mapping);
+                invalidate_mapping_pages(&inode->i_data, 0, -1);
+        }
+        /* Convert flock to posix lock */
+        fl->fl_owner = (fl_owner_t)filp;
+        fl->fl_start = 0;
+        fl->fl_end = OFFSET_MAX;
+        fl->fl_flags |= FL_POSIX;
+        /* FL_FLOCK is known to be set here; clear it explicitly. */
+        fl->fl_flags &= ~FL_FLOCK;
+        if (IS_SETLK(cmd) || IS_SETLKW(cmd))
+                ret = v9fs_file_do_lock(filp, cmd, fl);
+        else
+                ret = -EINVAL;
+out_err:
+        return ret;
+}
 /**
 * v9fs_file_readn - read from a file
 * @filp: file pointer to read
@@ -219,7 +423,9 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
                size_t count, loff_t * offset)
 {
-        int n, rsize, total = 0;
+        ssize_t retval;
+        size_t total = 0;
+        int n;
        struct p9_fid *fid;
        struct p9_client *clnt;
        struct inode *inode = filp->f_path.dentry->d_inode;
@@ -232,14 +438,19 @@ v9fs_file_write(struct file *filp, const char __user * data,
        fid = filp->private_data;
        clnt = fid->clnt;
-        rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
+        retval = generic_write_checks(filp, &origin, &count, 0);
+        if (retval)
+                goto out;
-        do {
+        retval = -EINVAL;
-                if (count < rsize)
+        if ((ssize_t) count < 0)
-                        rsize = count;
+                goto out;
+        retval = 0;
+        if (!count)
+                goto out;
-                n = p9_client_write(fid, NULL, data+total, origin+total,
+        do {
-                                                                        rsize);
+                n = p9_client_write(fid, NULL, data+total, origin+total, count);
                if (n <= 0)
                        break;
                count -= n;
@@ -258,9 +469,11 @@ v9fs_file_write(struct file *filp, const char __user * data,
        }
        if (n < 0)
-                return n;
+                retval = n;
+        else
-        return total;
+                retval = total;
+out:
+        return retval;
 }
 static int v9fs_file_fsync(struct file *filp, int datasync)
@@ -278,6 +491,20 @@ static int v9fs_file_fsync(struct file *filp, int datasync)
        return retval;
 }
+/*
+ * v9fs_file_fsync_dotl - fsync handler wired into the _dotl file operations
+ * @filp: open file; its private_data holds the 9P fid
+ * @datasync: datasync flag, passed through unchanged
+ *
+ * Forwards the request to p9_client_fsync() on the file's fid and returns
+ * whatever that call returns.
+ */
+int v9fs_file_fsync_dotl(struct file *filp, int datasync)
+{
+        struct p9_fid *fid = filp->private_data;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
+                        filp, datasync);
+        return p9_client_fsync(fid, datasync);
+}
 static const struct file_operations v9fs_cached_file_operations = {
        .llseek = generic_file_llseek,
        .read = do_sync_read,
@@ -290,6 +517,19 @@ static const struct file_operations v9fs_cached_file_operations = {
        .fsync = v9fs_file_fsync,
 };
+/*
+ * File operations table that pairs the generic page-cache read path
+ * (do_sync_read / generic_file_aio_read) with the _dotl lock, flock and
+ * fsync handlers.
+ * NOTE(review): presumably selected for cached 9P2000.L mounts; confirm at
+ * the place where inode->i_fop is assigned.
+ */
+static const struct file_operations v9fs_cached_file_operations_dotl = {
+        .llseek   = generic_file_llseek,
+        .read     = do_sync_read,
+        .aio_read = generic_file_aio_read,
+        .write    = v9fs_file_write,
+        .open     = v9fs_file_open,
+        .release  = v9fs_dir_release,
+        .lock     = v9fs_file_lock_dotl,
+        .flock    = v9fs_file_flock_dotl,
+        .mmap     = generic_file_readonly_mmap,
+        .fsync    = v9fs_file_fsync_dotl,
+};
 const struct file_operations v9fs_file_operations = {
        .llseek = generic_file_llseek,
        .read = v9fs_file_read,
@@ -307,7 +547,8 @@ const struct file_operations v9fs_file_operations_dotl = {
        .write = v9fs_file_write,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
-        .lock = v9fs_file_lock,
+        .lock = v9fs_file_lock_dotl,
+        .flock = v9fs_file_flock_dotl,
        .mmap = generic_file_readonly_mmap,
-        .fsync = v9fs_file_fsync,
+        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d527646..34bf71b56542 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -36,6 +36,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -44,6 +45,7 @@
 #include "fid.h"
 #include "cache.h"
 #include "xattr.h"
+#include "acl.h"
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -53,6 +55,10 @@ static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
 static const struct inode_operations v9fs_symlink_inode_operations_dotl;
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
+                    dev_t rdev);
 /**
 * unixmode2p9mode - convert unix mode bits to plan 9
 * @v9ses: v9fs session information
@@ -500,6 +506,11 @@ v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
        v9fs_vcookie_set_qid(ret, &st->qid);
        v9fs_cache_inode_get_cookie(ret);
 #endif
+        err = v9fs_get_acl(ret, fid);
+        if (err) {
+                iput(ret);
+                goto error;
+        }
        kfree(st);
        return ret;
 error:
@@ -553,13 +564,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
        return retval;
 }
-static int
-v9fs_open_created(struct inode *inode, struct file *file)
-{
-        return 0;
-}
 /**
 * v9fs_create - Create a file
 * @v9ses: session information
@@ -655,29 +659,37 @@ error:
 */
 static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                struct nameidata *nd)
 {
        int err = 0;
        char *name = NULL;
        gid_t gid;
        int flags;
+        mode_t mode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL;
        struct p9_fid *dfid, *ofid;
        struct file *filp;
        struct p9_qid qid;
        struct inode *inode;
+        struct posix_acl *pacl = NULL, *dacl = NULL;
        v9ses = v9fs_inode2v9ses(dir);
        if (nd && nd->flags & LOOKUP_OPEN)
                flags = nd->intent.open.flags - 1;
-        else
+        else {
-                flags = O_RDWR;
+                /*
+                 * create call without LOOKUP_OPEN is due
+                 * to mknod of regular files. So use mknod
+                 * operation.
+                 */
+                return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+        }
        name = (char *) dentry->d_name.name;
        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
-                        "mode:0x%x\n", name, flags, mode);
+                        "mode:0x%x\n", name, flags, omode);
        dfid = v9fs_fid_lookup(dentry->d_parent);
        if (IS_ERR(dfid)) {
@@ -695,6 +707,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
        }
        gid = v9fs_get_fsgid_for_create(dir);
+        mode = omode;
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in creat %d\n", err);
+                goto error;
+        }
        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
        if (err < 0) {
                P9_DPRINTK(P9_DEBUG_VFS,
@@ -702,46 +723,52 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
                                err);
                goto error;
        }
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE ||
+            (nd && nd->flags & LOOKUP_OPEN)) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL;
+                        goto error;
+                }
-        /* No need to populate the inode if we are not opening the file AND
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-         * not in cached mode.
+                if (IS_ERR(inode)) {
-         */
+                        err = PTR_ERR(inode);
-        if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
-                /* Not in cached mode. No need to populate inode with stat */
+                                err);
-                dentry->d_op = &v9fs_dentry_operations;
+                        goto error;
-                p9_client_clunk(ofid);
+                }
-                d_instantiate(dentry, NULL);
-                return 0;
-        }
-        /* Now walk from the parent so we can get an unopened fid. */
-        fid = p9_client_walk(dfid, 1, &name, 1);
-        if (IS_ERR(fid)) {
-                err = PTR_ERR(fid);
-                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-                fid = NULL;
-                goto error;
-        }
-        /* instantiate inode and assign the unopened fid to dentry */
-        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
-        if (IS_ERR(inode)) {
-                err = PTR_ERR(inode);
-                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
-                goto error;
-        }
-        if (v9ses->cache)
                dentry->d_op = &v9fs_cached_dentry_operations;
-        else
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                /* The fid would get clunked via a dput */
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
                dentry->d_op = &v9fs_dentry_operations;
-        d_instantiate(dentry, inode);
+                d_instantiate(dentry, inode);
-        err = v9fs_fid_add(dentry, fid);
+        }
-        if (err < 0)
+        /* Now set the ACL based on the default value */
-                goto error;
+        v9fs_set_create_acl(dentry, dacl, pacl);
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        p9_client_clunk(ofid);
                        return PTR_ERR(filp);
@@ -800,7 +827,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
-                filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        err = PTR_ERR(filp);
                        goto error;
@@ -859,23 +886,28 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 *
 */
-static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
+static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-                                        int mode)
+                               struct dentry *dentry, int omode)
 {
        int err;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL, *dfid = NULL;
        gid_t gid;
        char *name;
+        mode_t mode;
        struct inode *inode;
        struct p9_qid qid;
        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
        err = 0;
        v9ses = v9fs_inode2v9ses(dir);
-        mode |= S_IFDIR;
+        omode |= S_IFDIR;
+        if (dir->i_mode & S_ISGID)
+                omode |= S_ISGID;
        dir_dentry = v9fs_dentry_from_dir_inode(dir);
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
@@ -886,11 +918,14 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
        }
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
+        mode = omode;
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mkdir %d\n", err);
                goto error;
        }
        name = (char *) dentry->d_name.name;
        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
        if (err < 0)
@@ -920,7 +955,23 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
                if (err < 0)
                        goto error;
                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate
+                 * inode with stat. We need to get an inode
+                 * so that we can set the acl with dentry
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_dentry_operations;
+                d_instantiate(dentry, inode);
        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
 error:
        if (fid)
                p9_client_clunk(fid);
@@ -979,7 +1030,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        result = v9fs_fid_add(dentry, fid);
        if (result < 0)
-                goto error;
+                goto error_iput;
 inst_out:
        if (v9ses->cache)
@@ -990,6 +1041,8 @@ inst_out:
        d_add(dentry, inode);
        return NULL;
+error_iput:
+        iput(inode);
 error:
        p9_client_clunk(fid);
@@ -1237,7 +1290,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 *
 */
-static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 {
        int retval;
        struct v9fs_session_info *v9ses;
@@ -1279,6 +1332,12 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
        setattr_copy(dentry->d_inode, iattr);
        mark_inode_dirty(dentry->d_inode);
+        if (iattr->ia_valid & ATTR_MODE) {
+                /* We also want to update ACL when we update mode bits */
+                retval = v9fs_acl_chmod(dentry);
+                if (retval < 0)
+                        return retval;
+        }
        return 0;
 }
@@ -1473,7 +1532,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
        if (IS_ERR(fid))
                return PTR_ERR(fid);
-        if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
+        if (!v9fs_proto_dotu(v9ses))
                return -EBADF;
        st = p9_client_stat(fid);
@@ -1616,11 +1675,6 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
-                goto error;
-        }
        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
@@ -1789,9 +1843,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                kfree(st);
        } else {
                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just i_count++
+                 * This dentry will be released immediately. So, just hold the
+                 * inode
                 */
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        dentry->d_op = old_dentry->d_op;
@@ -1854,21 +1909,23 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 *
 */
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
                dev_t rdev)
 {
        int err;
        char *name;
+        mode_t mode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL, *dfid = NULL;
        struct inode *inode;
        gid_t gid;
        struct p9_qid qid;
        struct dentry *dir_dentry;
+        struct posix_acl *dacl = NULL, *pacl = NULL;
        P9_DPRINTK(P9_DEBUG_VFS,
                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
-                dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+                dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev));
        if (!new_valid_dev(rdev))
                return -EINVAL;
@@ -1884,11 +1941,14 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
        }
        gid = v9fs_get_fsgid_for_create(dir);
-        if (gid < 0) {
+        mode = omode;
-                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+        /* Update mode based on ACL value */
+        err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
+        if (err) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                           "Failed to get acl values in mknod %d\n", err);
                goto error;
        }
        name = (char *) dentry->d_name.name;
        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
@@ -1932,13 +1992,68 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
                dentry->d_op = &v9fs_dentry_operations;
                d_instantiate(dentry, inode);
        }
+        /* Now set the ACL based on the default value */
+        v9fs_set_create_acl(dentry, dacl, pacl);
 error:
        if (fid)
                p9_client_clunk(fid);
        return err;
 }
+/**
+ * v9fs_vfs_readlink_dotl - read a symlink's target into a buffer
+ * @dentry: dentry for the symlink
+ * @buffer: buffer that receives the link target
+ * @buflen: size of @buffer in bytes
+ *
+ * Looks up the fid for @dentry, requests the link target with
+ * p9_client_readlink() and copies at most @buflen bytes of it into @buffer
+ * with strncpy(). The copy is NOT NUL-terminated when the target is @buflen
+ * bytes or longer; callers must terminate it using the returned length.
+ *
+ * NOTE(review): @buffer is written with strncpy(), i.e. treated as a kernel
+ * pointer. Confirm that every caller passes a kernel buffer.
+ *
+ * Returns the number of bytes placed in @buffer (at most @buflen), or a
+ * negative errno if the fid lookup or the readlink request fails.
+ */
+static int
+v9fs_vfs_readlink_dotl(struct dentry *dentry, char *buffer, int buflen)
+{
+        int retval;
+        struct p9_fid *fid;
+        char *target = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        retval = p9_client_readlink(fid, &target);
+        if (retval < 0)
+                return retval;
+        strncpy(buffer, target, buflen);
+        /* Print the source string: buffer may be unterminated at this point */
+        P9_DPRINTK(P9_DEBUG_VFS, "%s -> %s\n", dentry->d_name.name, target);
+        retval = strnlen(buffer, buflen);
+        /*
+         * The string handed back through &target belongs to us; release it
+         * so that each readlink does not leak it. kfree(NULL) is a no-op.
+         * NOTE(review): assumes p9_client_readlink() kmalloc's the string;
+         * confirm against net/9p/client.c.
+         */
+        kfree(target);
+        return retval;
+}
+/**
+ * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ * Reads the link target into a PATH_MAX buffer obtained from __getname()
+ * and hands it to the VFS through nd_set_link(). If the allocation or the
+ * readlink fails, an ERR_PTR is passed to nd_set_link() instead (the buffer
+ * is released first in the readlink-failure case).
+ *
+ * Always returns NULL. On success the buffer stays allocated after return.
+ * NOTE(review): presumably released by the matching put_link handler;
+ * confirm.
+ */
+static void *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+{
+        int len = 0;
+        char *link = __getname();
+        P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+        if (!link)
+                link = ERR_PTR(-ENOMEM);
+        else {
+                len = v9fs_vfs_readlink_dotl(dentry, link, PATH_MAX);
+                if (len < 0) {
+                        __putname(link);
+                        link = ERR_PTR(len);
+                } else {
+                        /* terminate here; clamp so a full buffer stays in bounds */
+                        link[min(len, PATH_MAX-1)] = 0;
+                }
+        }
+        nd_set_link(nd, link);
+        return NULL;
+}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -1969,7 +2084,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotl = {
        .getxattr = generic_getxattr,
        .removexattr = generic_removexattr,
        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
 };
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1996,6 +2111,7 @@ static const struct inode_operations v9fs_file_inode_operations_dotl = {
        .getxattr = generic_getxattr,
        .removexattr = generic_removexattr,
        .listxattr = v9fs_listxattr,
+        .check_acl = v9fs_check_acl,
 };
 static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -2007,8 +2123,8 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 };
 static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-        .readlink = generic_readlink,
+        .readlink = v9fs_vfs_readlink_dotl,
-        .follow_link = v9fs_vfs_follow_link,
+        .follow_link = v9fs_vfs_follow_link_dotl,
        .put_link = v9fs_vfs_put_link,
        .getattr = v9fs_vfs_getattr_dotl,
        .setattr = v9fs_vfs_setattr_dotl,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d12ba0ed3db..c55c614500ad 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -39,6 +39,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/magic.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -46,6 +47,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "xattr.h"
+#include "acl.h"
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
@@ -66,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 * v9fs_fill_super - populate superblock with info
 * @sb: superblock
 * @v9ses: session information
- * @flags: flags propagated from v9fs_get_sb()
+ * @flags: flags propagated from v9fs_mount()
 *
 */
@@ -88,22 +90,25 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
            MS_NOATIME;
+#ifdef CONFIG_9P_FS_POSIX_ACL
+        if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+                sb->s_flags |= MS_POSIXACL;
+#endif
        save_mount_options(sb, data);
 }
 /**
- * v9fs_get_sb - mount a superblock
+ * v9fs_mount - mount a superblock
 * @fs_type: file system type
 * @flags: mount flags
 * @dev_name: device name that was mounted
 * @data: mount options
- * @mnt: mountpoint record to be instantiated
 *
 */
-static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data,
+                       const char *dev_name, void *data)
-                       struct vfsmount *mnt)
 {
        struct super_block *sb = NULL;
        struct inode *inode = NULL;
@@ -117,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
        if (!v9ses)
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        fid = v9fs_session_init(v9ses, dev_name, data);
        if (IS_ERR(fid)) {
@@ -149,7 +154,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                goto release_sb;
        }
        sb->s_root = root;
        if (v9fs_proto_dotl(v9ses)) {
                struct p9_stat_dotl *st = NULL;
                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
@@ -174,19 +178,21 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                p9stat_free(st);
                kfree(st);
        }
+        retval = v9fs_get_acl(inode, fid);
+        if (retval)
+                goto release_sb;
        v9fs_fid_add(root, fid);
        P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 clunk_fid:
        p9_client_clunk(fid);
 close_session:
        v9fs_session_close(v9ses);
        kfree(v9ses);
-        return retval;
+        return ERR_PTR(retval);
 release_sb:
        /*
         * we will do the session_close and root dentry release
@@ -196,7 +202,7 @@ release_sb:
         */
        p9_client_clunk(fid);
        deactivate_locked_super(sb);
-        return retval;
+        return ERR_PTR(retval);
 }
 /**
@@ -249,7 +255,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
        if (v9fs_proto_dotl(v9ses)) {
                res = p9_client_statfs(fid, &rs);
                if (res == 0) {
-                        buf->f_type = rs.type;
+                        buf->f_type = V9FS_MAGIC;
                        buf->f_bsize = rs.bsize;
                        buf->f_blocks = rs.blocks;
                        buf->f_bfree = rs.bfree;
@@ -292,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 struct file_system_type v9fs_fs_type = {
        .name = "9p",
-        .get_sb = v9fs_get_sb,
+        .mount = v9fs_mount,
        .kill_sb = v9fs_kill_super,
        .owner = THIS_MODULE,
        .fs_flags = FS_RENAME_DOES_D_MOVE,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f88e5c2dc873..43ec7df84336 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -21,30 +21,13 @@
 #include "fid.h"
 #include "xattr.h"
-/*
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
- * v9fs_xattr_get()
+                           void *buffer, size_t buffer_size)
- *
- * Copy an extended attribute into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t buffer_size)
 {
        ssize_t retval;
        int msize, read_count;
        u64 offset = 0, attr_size;
-        struct p9_fid *fid, *attr_fid;
+        struct p9_fid *attr_fid;
-        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
-                __func__, name, buffer_size);
-        fid = v9fs_fid_lookup(dentry);
-        if (IS_ERR(fid))
-                return PTR_ERR(fid);
        attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
        if (IS_ERR(attr_fid)) {
@@ -88,6 +71,31 @@ error:
 }
+/*
+ * v9fs_xattr_get()
+ *
+ * Dentry-based front end for v9fs_fid_xattr_get(): resolve the fid that
+ * belongs to the dentry, then fetch the named extended attribute through it.
+ *
+ * The attribute is copied into the buffer provided; pass a NULL buffer to
+ * compute the size of the buffer required instead.
+ *
+ * Returns a negative error number on failure (including a failed fid
+ * lookup), or the number of bytes used / required on success.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+                       void *buffer, size_t buffer_size)
+{
+        struct p9_fid *dfid;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+                __func__, name, buffer_size);
+        dfid = v9fs_fid_lookup(dentry);
+        if (!IS_ERR(dfid))
+                return v9fs_fid_xattr_get(dfid, name, buffer, buffer_size);
+        return PTR_ERR(dfid);
+}
 /*
 * v9fs_xattr_set()
 *
@@ -156,5 +164,9 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 const struct xattr_handler *v9fs_xattr_handlers[] = {
        &v9fs_xattr_user_handler,
+#ifdef CONFIG_9P_FS_POSIX_ACL
+        &v9fs_xattr_acl_access_handler,
+        &v9fs_xattr_acl_default_handler,
+#endif
        NULL
 };
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 9ddf672ae5c4..eaa837c53bd5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -15,10 +15,16 @@
 #define FS_9P_XATTR_H
 #include <linux/xattr.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
+extern const struct xattr_handler v9fs_xattr_acl_access_handler;
+extern const struct xattr_handler v9fs_xattr_acl_default_handler;
+extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
+                                  void *, size_t);
 extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
                              void *, size_t);
 extern int v9fs_xattr_set(struct dentry *, const char *,
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..771f457402d4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig"
 endif # BLOCK
+config EXPORTFS
+        tristate
 config FILE_LOCKING
        bool "Enable POSIX file locking API" if EMBEDDED
        default y
@@ -59,7 +62,6 @@ source "fs/notify/Kconfig"
 source "fs/quota/Kconfig"
-source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
@@ -221,9 +223,6 @@ config LOCKD_V4
        depends on FILE_LOCKING
        default y
-config EXPORTFS
-        tristate
 config NFS_ACL_SUPPORT
        tristate
        select FS_POSIX_ACL
@@ -234,7 +233,6 @@ config NFS_COMMON
        default y
 source "net/sunrpc/Kconfig"
-source "fs/smbfs/Kconfig"
 source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bb4cc5b8abc8..79e2ca7973b7 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
        bool "Write ELF core dumps with partial segments"
-        default n
+        default y
        depends on BINFMT_ELF && ELF_CORE
        help
          ELF core dump files describe each memory mapping of the crashed
@@ -60,7 +60,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
          inherited.  See Documentation/filesystems/proc.txt for details.
          This config option changes the default setting of coredump_filter
-          seen at boot time.  If unsure, say N.
+          seen at boot time.  If unsure, say Y.
 config BINFMT_FLAT
        bool "Kernel support for flat binaries"
diff --git a/fs/Makefile b/fs/Makefile
index e6ec1d309b1d..a7f7cef0c0c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,10 +29,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)            += compat.o compat_ioctl.o
+obj-$(CONFIG_NFSD_DEPRECATED)   += nfsctl.o
-nfsd-$(CONFIG_NFSD)             := nfsctl.o
-obj-y                           += $(nfsd-y) $(nfsd-m)
 obj-$(CONFIG_BINFMT_AOUT)       += binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)       += binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)       += binfmt_misc.o
@@ -91,7 +88,6 @@ obj-$(CONFIG_NFSD)		+= nfsd/
 obj-$(CONFIG_LOCKD)             += lockd/
 obj-$(CONFIG_NLS)               += nls/
 obj-$(CONFIG_SYSV_FS)           += sysv/
-obj-$(CONFIG_SMB_FS)            += smbfs/
 obj-$(CONFIG_CIFS)              += cifs/
 obj-$(CONFIG_NCP_FS)            += ncpfs/
 obj-$(CONFIG_HPFS_FS)           += hpfs/
@@ -104,7 +100,6 @@ obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)           += affs/
 obj-$(CONFIG_ROMFS_FS)          += romfs/
 obj-$(CONFIG_QNX4FS_FS)         += qnx4/
-obj-$(CONFIG_AUTOFS_FS)         += autofs/
 obj-$(CONFIG_AUTOFS4_FS)        += autofs4/
 obj-$(CONFIG_ADFS_FS)           += adfs/
 obj-$(CONFIG_FUSE_FS)           += fuse/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
 config ADFS_FS
        tristate "ADFS file system support (EXPERIMENTAL)"
        depends on BLOCK && EXPERIMENTAL
+        depends on BKL # need to fix
        help
          The Acorn Disc Filing System is the standard file system of the
          RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4a3af7075c1d..959dbff2d42d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -352,11 +352,15 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
        struct adfs_sb_info *asb;
        struct inode *root;
+        lock_kernel();
        sb->s_flags |= MS_NODIRATIME;
        asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-        if (!asb)
+        if (!asb) {
+                unlock_kernel();
                return -ENOMEM;
+        }
        sb->s_fs_info = asb;
        /* set default options */
@@ -474,6 +478,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
                goto error;
        } else
                sb->s_root->d_op = &adfs_dentry_operations;
+        unlock_kernel();
        return 0;
 error_free_bh:
@@ -481,20 +486,20 @@ error_free_bh:
 error:
        sb->s_fs_info = NULL;
        kfree(asb);
+        unlock_kernel();
        return -EINVAL;
 }
-static int adfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *adfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
-                           mnt);
 }
 static struct file_system_type adfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "adfs",
-        .get_sb         = adfs_get_sb,
+        .mount          = adfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875bd1a6..0a90dcd46de2 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@ affs_truncate(struct inode *inode)
                if (AFFS_SB(sb)->s_flags & SF_OFS) {
                        struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
                        u32 tmp;
-                        if (IS_ERR(ext_bh)) {
+                        if (IS_ERR(bh)) {
                                affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
-                                             ext, PTR_ERR(ext_bh));
+                                             ext, PTR_ERR(bh));
                                return;
                        }
                        tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec175ba..5d828903ac69 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
                affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
                mark_buffer_dirty_inode(inode_bh, inode);
                inode->i_nlink = 2;
-                atomic_inc(&inode->i_count);
+                ihold(inode);
        }
        affs_fix_checksum(sb, bh);
        mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 9581ea94d5a1..0cf7f4384cbd 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,7 +16,6 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "affs.h"
@@ -46,8 +45,6 @@ affs_put_super(struct super_block *sb)
        struct affs_sb_info *sbi = AFFS_SB(sb);
        pr_debug("AFFS: put_super()\n");
-        lock_kernel();
        if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
                affs_commit_super(sb, 1, 1);
@@ -56,8 +53,6 @@ affs_put_super(struct super_block *sb)
        affs_brelse(sbi->s_root_bh);
        kfree(sbi);
        sb->s_fs_info = NULL;
-        unlock_kernel();
 }
 static void
@@ -302,6 +297,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        sb->s_fs_info = sbi;
        mutex_init(&sbi->s_bmlock);
        spin_lock_init(&sbi->symlink_lock);
@@ -527,7 +523,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
                kfree(new_opts);
                return -EINVAL;
        }
-        lock_kernel();
        replace_mount_options(sb, new_opts);
        sbi->s_flags = mount_flags;
@@ -543,17 +539,15 @@ affs_remount(struct super_block *sb, int *flags, char *data)
        memcpy(sbi->s_volume, volume, 32);
        spin_unlock(&sbi->symlink_lock);
-        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
-                unlock_kernel();
                return 0;
-        }
        if (*flags & MS_RDONLY) {
                affs_write_super(sb);
                affs_free_bitmap(sb);
        } else
                res = affs_init_bitmap(sb, flags);
-        unlock_kernel();
        return res;
 }
@@ -579,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int affs_get_sb(struct file_system_type *fs_type,
+static struct dentry *affs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
-                           mnt);
 }
 static struct file_system_type affs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "affs",
-        .get_sb         = affs_get_sb,
+        .mount          = affs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09bd55e..5439e1bc9a86 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
        if (ret < 0)
                goto link_error;
-        atomic_inc(&vnode->vfs_inode.i_count);
+        ihold(&vnode->vfs_inode);
        d_instantiate(dentry, &vnode->vfs_inode);
        key_put(key);
        _leave(" = 0");
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 0931bc1325eb..757d664575dd 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -9,7 +9,6 @@
 * 2 of the License, or (at your option) any later version.
 */
-#include <linux/smp_lock.h>
 #include "internal.h"
 #define AFS_LOCK_GRANTED        0
@@ -274,7 +273,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
        type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
-        lock_kernel();
+        lock_flocks();
        /* make sure we've got a callback on this file and that our view of the
         * data version is up to date */
@@ -421,7 +420,7 @@ given_lock:
        afs_vnode_fetch_status(vnode, NULL, key);
 error:
-        unlock_kernel();
+        unlock_flocks();
        _leave(" = %d", ret);
        return ret;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6d552686c498..6153417caf57 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -29,6 +29,7 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 const struct file_operations afs_mntpt_file_operations = {
        .open           = afs_mntpt_open,
+        .llseek         = noop_llseek,
 };
 const struct inode_operations afs_mntpt_inode_operations = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 77e1e5a61154..27201cffece4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -19,7 +19,6 @@
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
@@ -30,9 +29,8 @@
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 static void afs_i_init_once(void *foo);
-static int afs_get_sb(struct file_system_type *fs_type,
+static struct dentry *afs_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name,
+                      int flags, const char *dev_name, void *data);
-                      void *data, struct vfsmount *mnt);
 static struct inode *afs_alloc_inode(struct super_block *sb);
 static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
@@ -41,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 struct file_system_type afs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "afs",
-        .get_sb         = afs_get_sb,
+        .mount          = afs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = 0,
 };
@@ -360,11 +358,8 @@ error:
 /*
 * get an AFS superblock
 */
-static int afs_get_sb(struct file_system_type *fs_type,
+static struct dentry *afs_mount(struct file_system_type *fs_type,
-                      int flags,
+                      int flags, const char *dev_name, void *options)
-                      const char *dev_name,
-                      void *options,
-                      struct vfsmount *mnt)
 {
        struct afs_mount_params params;
        struct super_block *sb;
@@ -428,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type,
                ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
        }
-        simple_set_mnt(mnt, sb);
        afs_put_volume(params.volume);
        afs_put_cell(params.cell);
        kfree(new_opts);
        _leave(" = 0 [%p]", sb);
-        return 0;
+        return dget(sb->s_root);
 error:
        afs_put_volume(params.volume);
@@ -441,7 +435,7 @@ error:
        key_put(params.key);
        kfree(new_opts);
        _leave(" = %d", ret);
-        return ret;
+        return ERR_PTR(ret);
 }
 /*
@@ -453,12 +447,8 @@ static void afs_put_super(struct super_block *sb)
        _enter("");
-        lock_kernel();
        afs_put_volume(as->volume);
-        unlock_kernel();
        _leave("");
 }
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 722743b152d8..15690bb1d3b5 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -438,7 +438,6 @@ no_more:
 */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = page->mapping->backing_dev_info;
        struct afs_writeback *wb;
        int ret;
@@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
        }
        wbc->nr_to_write -= ret;
-        if (wbc->nonblocking && bdi_write_congested(bdi))
-                wbc->encountered_congestion = 1;
        _leave(" = 0");
        return 0;
@@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping,
                                 struct writeback_control *wbc,
                                 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct afs_writeback *wb;
        struct page *page;
        int ret, n;
@@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping,
                wbc->nr_to_write -= ret;
-                if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                        wbc->encountered_congestion = 1;
-                        break;
-                }
                cond_resched();
        } while (index < end && wbc->nr_to_write > 0);
@@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping,
 int afs_writepages(struct address_space *mapping,
                   struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        pgoff_t start, end, next;
        int ret;
        _enter("");
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                _leave(" = 0 [congest]");
-                return 0;
-        }
        if (wbc->range_cyclic) {
                start = mapping->writeback_index;
                end = -1;
                ret = afs_writepages_region(mapping, wbc, start, end, &next);
-                if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
+                if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
-                    !(wbc->nonblocking && wbc->encountered_congestion))
                        ret = afs_writepages_region(mapping, wbc, 0, start,
                                                    &next);
                mapping->writeback_index = next;
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a73c8a8..8c8f6c5b6d79 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,19 @@ static void aio_batch_add(struct address_space *mapping,
        }
        abe = mempool_alloc(abe_pool, GFP_KERNEL);
-        BUG_ON(!igrab(mapping->host));
+        /*
+         * we should be using igrab here, but
+         * we don't want to hammer on the global
+         * inode spinlock just to take an extra
+         * reference on a file that we must already
+         * have a reference to.
+         *
+         * When we're called, we always have a reference
+         * on the file, so we must always have a reference
+         * on the inode, so ihold() is safe here.
+         */
+        ihold(mapping->host);
        abe->mapping = mapping;
        hlist_add_head(&abe->list, &batch_hash[bucket]);
        return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..57ce55b2564c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
 static const struct file_operations anon_inode_fops;
-static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
-                               const char *dev_name, void *data,
+                                int flags, const char *dev_name, void *data)
-                               struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+        return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
-                             mnt);
 }
 /*
@@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 static struct file_system_type anon_inode_fs_type = {
        .name           = "anon_inodefs",
-        .get_sb         = anon_inodefs_get_sb,
+        .mount          = anon_inodefs_mount,
        .kill_sb        = kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
@@ -111,10 +109,9 @@ struct file *anon_inode_getfile(const char *name,
        path.mnt = mntget(anon_inode_mnt);
        /*
         * We know the anon_inode inode count is always greater than zero,
-         * so we can avoid doing an igrab() and we can use an open-coded
+         * so ihold() is safe.
-         * atomic_inc().
         */
-        atomic_inc(&anon_inode_inode->i_count);
+        ihold(anon_inode_inode);
        path.dentry->d_op = &anon_inodefs_dentry_operations;
        d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +191,7 @@ static struct inode *anon_inode_mkinode(void)
        if (!inode)
                return ERR_PTR(-ENOMEM);
+        inode->i_ino = get_next_ino();
        inode->i_fop = &anon_inode_fops;
        inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
deleted file mode 100644
index 5f3bea90911e..000000000000
--- a/fs/autofs/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
-config AUTOFS_FS
-        tristate "Kernel automounter support"
-        help
-          The automounter is a tool to automatically mount remote file systems
-          on demand. This implementation is partially kernel-based to reduce
-          overhead in the already-mounted case; this is unlike the BSD
-          automounter (amd), which is a pure user space daemon.
-          To use the automounter you need the user-space tools from the autofs
-          package; you can find the location in <file:Documentation/Changes>.
-          You also want to answer Y to "NFS file system support", below.
-          If you want to use the newer version of the automounter with more
-          features, say N here and say Y to "Kernel automounter v4 support",
-          below.
-          To compile this support as a module, choose M here: the module will be
-          called autofs.
-          If you are not a part of a fairly large, distributed network, you
-          probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
deleted file mode 100644
index 453a60f46d05..000000000000
--- a/fs/autofs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux autofs-filesystem routines.
-#
-obj-$(CONFIG_AUTOFS_FS) += autofs.o
-autofs-objs := dirhash.o init.o inode.o root.o symlink.o waitq.o
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
deleted file mode 100644
index 901a3e67ec45..000000000000
--- a/fs/autofs/autofs_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/fs/autofs/autofs_i.h
- *
- *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-/* Internal header file for autofs */
-#include <linux/auto_fs.h>
-/* This is the range of ioctl() numbers we claim as ours */
-#define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
-#define AUTOFS_IOC_COUNT     32
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-#include <asm/current.h>
-#include <asm/uaccess.h>
-#ifdef DEBUG
-#define DPRINTK(D) (printk D)
-#else
-#define DPRINTK(D) ((void)0)
-#endif
-/*
- * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
- * kernel will keep the negative response cached for up to the time given
- * here, although the time can be shorter if the kernel throws the dcache
- * entry away.  This probably should be settable from user space.
- */
-#define AUTOFS_NEGATIVE_TIMEOUT (60*HZ) /* 1 minute */
-/* Structures associated with the root directory hash table */
-#define AUTOFS_HASH_SIZE 67
-struct autofs_dir_ent {
-        int hash;
-        char *name;
-        int len;
-        ino_t ino;
-        struct dentry *dentry;
-        /* Linked list of entries */
-        struct autofs_dir_ent *next;
-        struct autofs_dir_ent **back;
-        /* The following entries are for the expiry system */
-        unsigned long last_usage;
-        struct list_head exp;
-};
-struct autofs_dirhash {
-        struct autofs_dir_ent *h[AUTOFS_HASH_SIZE];
-        struct list_head expiry_head;
-};
-struct autofs_wait_queue {
-        wait_queue_head_t queue;
-        struct autofs_wait_queue *next;
-        autofs_wqt_t wait_queue_token;
-        /* We use the following to see what we are waiting for */
-        int hash;
-        int len;
-        char *name;
-        /* This is for status reporting upon return */
-        int status;
-        int wait_ctr;
-};
-struct autofs_symlink {
-        char *data;
-        int len;
-        time_t mtime;
-};
-#define AUTOFS_MAX_SYMLINKS 256
-#define AUTOFS_ROOT_INO      1
-#define AUTOFS_FIRST_SYMLINK 2
-#define AUTOFS_FIRST_DIR_INO (AUTOFS_FIRST_SYMLINK+AUTOFS_MAX_SYMLINKS)
-#define AUTOFS_SYMLINK_BITMAP_LEN \
-        ((AUTOFS_MAX_SYMLINKS+((sizeof(long)*1)-1))/(sizeof(long)*8))
-#define AUTOFS_SBI_MAGIC 0x6d4a556d
-struct autofs_sb_info {
-        u32 magic;
-        struct file *pipe;
-        struct pid *oz_pgrp;
-        int catatonic;
-        struct super_block *sb;
-        unsigned long exp_timeout;
-        ino_t next_dir_ino;
-        struct autofs_wait_queue *queues; /* Wait queue pointer */
-        struct autofs_dirhash dirhash; /* Root directory hash */
-        struct autofs_symlink symlink[AUTOFS_MAX_SYMLINKS];
-        unsigned long symlink_bitmap[AUTOFS_SYMLINK_BITMAP_LEN];
-};
-static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
-{
-        return (struct autofs_sb_info *)(sb->s_fs_info);
-}
-/* autofs_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-        return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
-}
-/* Hash operations */
-void autofs_initialize_hash(struct autofs_dirhash *);
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *,struct qstr *);
-void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
-void autofs_hash_delete(struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
-void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_sb_info *);
-/* Expiration-handling functions */
-void autofs_update_usage(struct autofs_dirhash *,struct autofs_dir_ent *);
-struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info *, struct vfsmount *mnt);
-/* Operations structures */
-extern const struct inode_operations autofs_root_inode_operations;
-extern const struct inode_operations autofs_symlink_inode_operations;
-extern const struct file_operations autofs_root_operations;
-/* Initializing function */
-int autofs_fill_super(struct super_block *, void *, int);
-void autofs_kill_sb(struct super_block *sb);
-struct inode *autofs_iget(struct super_block *, unsigned long);
-/* Queue management functions */
-int autofs_wait(struct autofs_sb_info *,struct qstr *);
-int autofs_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
-void autofs_catatonic_mode(struct autofs_sb_info *);
-#ifdef DEBUG
-void autofs_say(const char *name, int len);
-#else
-#define autofs_say(n,l) ((void)0)
-#endif
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
deleted file mode 100644
index e947915109e5..000000000000
--- a/fs/autofs/dirhash.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/dirhash.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include "autofs_i.h"
-/* Functions for maintenance of expiry queue */
-static void autofs_init_usage(struct autofs_dirhash *dh,
-                              struct autofs_dir_ent *ent)
-{
-        list_add_tail(&ent->exp, &dh->expiry_head);
-        ent->last_usage = jiffies;
-}
-static void autofs_delete_usage(struct autofs_dir_ent *ent)
-{
-        list_del(&ent->exp);
-}
-void autofs_update_usage(struct autofs_dirhash *dh,
-                         struct autofs_dir_ent *ent)
-{
-        autofs_delete_usage(ent);   /* Unlink from current position */
-        autofs_init_usage(dh,ent);  /* Relink at queue tail */
-}
-struct autofs_dir_ent *autofs_expire(struct super_block *sb,
-                                     struct autofs_sb_info *sbi,
-                                     struct vfsmount *mnt)
-{
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned long timeout = sbi->exp_timeout;
-        while (1) {
-                struct path path;
-                int umount_ok;
-                if ( list_empty(&dh->expiry_head) || sbi->catatonic )
-                        return NULL;    /* No entries */
-                /* We keep the list sorted by last_usage and want old stuff */
-                ent = list_entry(dh->expiry_head.next, struct autofs_dir_ent, exp);
-                if (jiffies - ent->last_usage < timeout)
-                        break;
-                /* Move to end of list in case expiry isn't desirable */
-                autofs_update_usage(dh, ent);
-                /* Check to see that entry is expirable */
-                if ( ent->ino < AUTOFS_FIRST_DIR_INO )
-                        return ent; /* Symlinks are always expirable */
-                /* Get the dentry for the autofs subdirectory */
-                path.dentry = ent->dentry;
-                if (!path.dentry) {
-                        /* Should only happen in catatonic mode */
-                        printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name);
-                        autofs_delete_usage(ent);
-                        continue;
-                }
-                if (!path.dentry->d_inode) {
-                        dput(path.dentry);
-                        printk("autofs: negative dentry on expiry queue: %s\n",
-                               ent->name);
-                        autofs_delete_usage(ent);
-                        continue;
-                }
-                /* Make sure entry is mounted and unused; note that dentry will
-                   point to the mounted-on-top root. */
-                if (!S_ISDIR(path.dentry->d_inode->i_mode) ||
-                    !d_mountpoint(path.dentry)) {
-                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-                        continue;
-                }
-                path.mnt = mnt;
-                path_get(&path);
-                if (!follow_down(&path)) {
-                        path_put(&path);
-                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
-                        continue;
-                }
-                while (d_mountpoint(path.dentry) && follow_down(&path))
-                        ;
-                umount_ok = may_umount(path.mnt);
-                path_put(&path);
-                if (umount_ok) {
-                        DPRINTK(("autofs: signaling expire on %s\n", ent->name));
-                        return ent; /* Expirable! */
-                }
-                DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name));
-        }
-        return NULL;            /* No expirable entries */
-}
-void autofs_initialize_hash(struct autofs_dirhash *dh) {
-        memset(&dh->h, 0, AUTOFS_HASH_SIZE*sizeof(struct autofs_dir_ent *));
-        INIT_LIST_HEAD(&dh->expiry_head);
-}
-struct autofs_dir_ent *autofs_hash_lookup(const struct autofs_dirhash *dh, struct qstr *name)
-{
-        struct autofs_dir_ent *dhn;
-        DPRINTK(("autofs_hash_lookup: hash = 0x%08x, name = ", name->hash));
-        autofs_say(name->name,name->len);
-        for ( dhn = dh->h[(unsigned) name->hash % AUTOFS_HASH_SIZE] ; dhn ; dhn = dhn->next ) {
-                if ( name->hash == dhn->hash &&
-                     name->len == dhn->len &&
-                     !memcmp(name->name, dhn->name, name->len) )
-                        break;
-        }
-        return dhn;
-}
-void autofs_hash_insert(struct autofs_dirhash *dh, struct autofs_dir_ent *ent)
-{
-        struct autofs_dir_ent **dhnp;
-        DPRINTK(("autofs_hash_insert: hash = 0x%08x, name = ", ent->hash));
-        autofs_say(ent->name,ent->len);
-        autofs_init_usage(dh,ent);
-        if (ent->dentry)
-                dget(ent->dentry);
-        dhnp = &dh->h[(unsigned) ent->hash % AUTOFS_HASH_SIZE];
-        ent->next = *dhnp;
-        ent->back = dhnp;
-        *dhnp = ent;
-        if ( ent->next )
-                ent->next->back = &(ent->next);
-}
-void autofs_hash_delete(struct autofs_dir_ent *ent)
-{
-        *(ent->back) = ent->next;
-        if ( ent->next )
-                ent->next->back = ent->back;
-        autofs_delete_usage(ent);
-        if ( ent->dentry )
-                dput(ent->dentry);
-        kfree(ent->name);
-        kfree(ent);
-}
-/*
- * Used by readdir().  We must validate "ptr", so we can't simply make it
- * a pointer.  Values below 0xffff are reserved; calling with any value
- * <= 0x10000 will return the first entry found.
- *
- * "last" can be NULL or the value returned by the last search *if* we
- * want the next sequential entry.
- */
-struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *dh,
-                                        off_t *ptr, struct autofs_dir_ent *last)
-{
-        int bucket, ecount, i;
-        struct autofs_dir_ent *ent;
-        bucket = (*ptr >> 16) - 1;
-        ecount = *ptr & 0xffff;
-        if ( bucket < 0 ) {
-                bucket = ecount = 0;
-        } 
-        DPRINTK(("autofs_hash_enum: bucket %d, entry %d\n", bucket, ecount));
-        ent = last ? last->next : NULL;
-        if ( ent ) {
-                ecount++;
-        } else {
-                while  ( bucket < AUTOFS_HASH_SIZE ) {
-                        ent = dh->h[bucket];
-                        for ( i = ecount ; ent && i ; i-- )
-                                ent = ent->next;
-                        
-                        if (ent) {
-                                ecount++; /* Point to *next* entry */
-                                break;
-                        }
-                        
-                        bucket++; ecount = 0;
-                }
-        }
-#ifdef DEBUG
-        if ( !ent )
-                printk("autofs_hash_enum: nothing found\n");
-        else {
-                printk("autofs_hash_enum: found hash %08x, name", ent->hash);
-                autofs_say(ent->name,ent->len);
-        }
-#endif
-        *ptr = ((bucket+1) << 16) + ecount;
-        return ent;
-}
-/* Iterate over all the ents, and remove all dentry pointers.  Used on
-   entering catatonic mode, in order to make the filesystem unmountable. */
-void autofs_hash_dputall(struct autofs_dirhash *dh)
-{
-        int i;
-        struct autofs_dir_ent *ent;
-        for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-                for ( ent = dh->h[i] ; ent ; ent = ent->next ) {
-                        if ( ent->dentry ) {
-                                dput(ent->dentry);
-                                ent->dentry = NULL;
-                        }
-                }
-        }
-}
-/* Delete everything.  This is used on filesystem destruction, so we
-   make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_sb_info *sbi)
-{
-        int i;
-        struct autofs_dir_ent *ent, *nent;
-        for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-                for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
-                        nent = ent->next;
-                        if ( ent->dentry )
-                                dput(ent->dentry);
-                        kfree(ent->name);
-                        kfree(ent);
-                }
-        }
-}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
deleted file mode 100644
index cea5219b4f37..000000000000
--- a/fs/autofs/init.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include "autofs_i.h"
-static int autofs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
-}
-static struct file_system_type autofs_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "autofs",
-        .get_sb         = autofs_get_sb,
-        .kill_sb        = autofs_kill_sb,
-};
-static int __init init_autofs_fs(void)
-{
-        return register_filesystem(&autofs_fs_type);
-}
-static void __exit exit_autofs_fs(void)
-{
-        unregister_filesystem(&autofs_fs_type);
-}
-module_init(init_autofs_fs);
-module_exit(exit_autofs_fs);
-#ifdef DEBUG
-void autofs_say(const char *name, int len)
-{
-        printk("(%d: ", len);
-        while ( len-- )
-                printk("%c", *name++);
-        printk(")\n");
-}
-#endif
-MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
deleted file mode 100644
index e1734f2d6e26..000000000000
--- a/fs/autofs/inode.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/parser.h>
-#include <linux/bitops.h>
-#include <linux/magic.h>
-#include "autofs_i.h"
-#include <linux/module.h>
-void autofs_kill_sb(struct super_block *sb)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(sb);
-        unsigned int n;
-        /*
-         * In the event of a failure in get_sb_nodev the superblock
-         * info is not present so nothing else has been setup, so
-         * just call kill_anon_super when we are called from
-         * deactivate_super.
-         */
-        if (!sbi)
-                goto out_kill_sb;
-        if (!sbi->catatonic)
-                autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
-        put_pid(sbi->oz_pgrp);
-        autofs_hash_nuke(sbi);
-        for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
-                if (test_bit(n, sbi->symlink_bitmap))
-                        kfree(sbi->symlink[n].data);
-        }
-        kfree(sb->s_fs_info);
-out_kill_sb:
-        DPRINTK(("autofs: shutting down\n"));
-        kill_anon_super(sb);
-}
-static const struct super_operations autofs_sops = {
-        .statfs         = simple_statfs,
-        .show_options   = generic_show_options,
-};
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-static const match_table_t autofs_tokens = {
-        {Opt_fd, "fd=%u"},
-        {Opt_uid, "uid=%u"},
-        {Opt_gid, "gid=%u"},
-        {Opt_pgrp, "pgrp=%u"},
-        {Opt_minproto, "minproto=%u"},
-        {Opt_maxproto, "maxproto=%u"},
-        {Opt_err, NULL}
-};
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-                pid_t *pgrp, int *minproto, int *maxproto)
-{
-        char *p;
-        substring_t args[MAX_OPT_ARGS];
-        int option;
-        *uid = current_uid();
-        *gid = current_gid();
-        *pgrp = task_pgrp_nr(current);
-        *minproto = *maxproto = AUTOFS_PROTO_VERSION;
-        *pipefd = -1;
-        if (!options)
-                return 1;
-        while ((p = strsep(&options, ",")) != NULL) {
-                int token;
-                if (!*p)
-                        continue;
-                token = match_token(p, autofs_tokens, args);
-                switch (token) {
-                case Opt_fd:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *pipefd = option;
-                        break;
-                case Opt_uid:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *uid = option;
-                        break;
-                case Opt_gid:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *gid = option;
-                        break;
-                case Opt_pgrp:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *pgrp = option;
-                        break;
-                case Opt_minproto:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *minproto = option;
-                        break;
-                case Opt_maxproto:
-                        if (match_int(&args[0], &option))
-                                return 1;
-                        *maxproto = option;
-                        break;
-                default:
-                        return 1;
-                }
-        }
-        return (*pipefd < 0);
-}
-int autofs_fill_super(struct super_block *s, void *data, int silent)
-{
-        struct inode * root_inode;
-        struct dentry * root;
-        struct file * pipe;
-        int pipefd;
-        struct autofs_sb_info *sbi;
-        int minproto, maxproto;
-        pid_t pgid;
-        save_mount_options(s, data);
-        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-        if (!sbi)
-                goto fail_unlock;
-        DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
-        s->s_fs_info = sbi;
-        sbi->magic = AUTOFS_SBI_MAGIC;
-        sbi->pipe = NULL;
-        sbi->catatonic = 1;
-        sbi->exp_timeout = 0;
-        autofs_initialize_hash(&sbi->dirhash);
-        sbi->queues = NULL;
-        memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
-        sbi->next_dir_ino = AUTOFS_FIRST_DIR_INO;
-        s->s_blocksize = 1024;
-        s->s_blocksize_bits = 10;
-        s->s_magic = AUTOFS_SUPER_MAGIC;
-        s->s_op = &autofs_sops;
-        s->s_time_gran = 1;
-        sbi->sb = s;
-        root_inode = autofs_iget(s, AUTOFS_ROOT_INO);
-        if (IS_ERR(root_inode))
-                goto fail_free;
-        root = d_alloc_root(root_inode);
-        pipe = NULL;
-        if (!root)
-                goto fail_iput;
-        /* Can this call block?  - WTF cares? s is locked. */
-        if (parse_options(data, &pipefd, &root_inode->i_uid,
-                                &root_inode->i_gid, &pgid, &minproto,
-                                &maxproto)) {
-                printk("autofs: called with bogus options\n");
-                goto fail_dput;
-        }
-        /* Couldn't this be tested earlier? */
-        if (minproto > AUTOFS_PROTO_VERSION ||
-             maxproto < AUTOFS_PROTO_VERSION) {
-                printk("autofs: kernel does not match daemon version\n");
-                goto fail_dput;
-        }
-        DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
-        sbi->oz_pgrp = find_get_pid(pgid);
-        if (!sbi->oz_pgrp) {
-                printk("autofs: could not find process group %d\n", pgid);
-                goto fail_dput;
-        }
-        pipe = fget(pipefd);
-        
-        if (!pipe) {
-                printk("autofs: could not open pipe file descriptor\n");
-                goto fail_put_pid;
-        }
-        if (!pipe->f_op || !pipe->f_op->write)
-                goto fail_fput;
-        sbi->pipe = pipe;
-        sbi->catatonic = 0;
-        /*
-         * Success! Install the root dentry now to indicate completion.
-         */
-        s->s_root = root;
-        return 0;
-fail_fput:
-        printk("autofs: pipe file descriptor does not contain proper ops\n");
-        fput(pipe);
-fail_put_pid:
-        put_pid(sbi->oz_pgrp);
-fail_dput:
-        dput(root);
-        goto fail_free;
-fail_iput:
-        printk("autofs: get root dentry failed\n");
-        iput(root_inode);
-fail_free:
-        kfree(sbi);
-        s->s_fs_info = NULL;
-fail_unlock:
-        return -EINVAL;
-}
-struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
-{
-        unsigned int n;
-        struct autofs_sb_info *sbi = autofs_sbi(sb);
-        struct inode *inode;
-        inode = iget_locked(sb, ino);
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
-        if (!(inode->i_state & I_NEW))
-                return inode;
-        /* Initialize to the default case (stub directory) */
-        inode->i_op = &simple_dir_inode_operations;
-        inode->i_fop = &simple_dir_operations;
-        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-        inode->i_nlink = 2;
-        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-        if (ino == AUTOFS_ROOT_INO) {
-                inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
-                inode->i_op = &autofs_root_inode_operations;
-                inode->i_fop = &autofs_root_operations;
-                goto done;
-        } 
-        
-        inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
-        inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
-        
-        if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
-                /* Symlink inode - should be in symlink list */
-                struct autofs_symlink *sl;
-                n = ino - AUTOFS_FIRST_SYMLINK;
-                if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
-                        printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
-                        goto done;
-                }
-                
-                inode->i_op = &autofs_symlink_inode_operations;
-                sl = &sbi->symlink[n];
-                inode->i_private = sl;
-                inode->i_mode = S_IFLNK | S_IRWXUGO;
-                inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
-                inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
-                inode->i_size = sl->len;
-                inode->i_nlink = 1;
-        }
-done:
-        unlock_new_inode(inode);
-        return inode;
-}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
deleted file mode 100644
index 11b1ea786d00..000000000000
--- a/fs/autofs/root.c
+++ /dev/null
@@ -1,643 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/param.h>
-#include <linux/time.h>
-#include <linux/compat.h>
-#include <linux/smp_lock.h>
-#include "autofs_i.h"
-static int autofs_root_readdir(struct file *,void *,filldir_t);
-static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *);
-static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
-static int autofs_root_unlink(struct inode *,struct dentry *);
-static int autofs_root_rmdir(struct inode *,struct dentry *);
-static int autofs_root_mkdir(struct inode *,struct dentry *,int);
-static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
-static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
-const struct file_operations autofs_root_operations = {
-        .llseek         = generic_file_llseek,
-        .read           = generic_read_dir,
-        .readdir        = autofs_root_readdir,
-        .unlocked_ioctl = autofs_root_ioctl,
-#ifdef CONFIG_COMPAT
-        .compat_ioctl   = autofs_root_compat_ioctl,
-#endif
-};
-const struct inode_operations autofs_root_inode_operations = {
-        .lookup         = autofs_root_lookup,
-        .unlink         = autofs_root_unlink,
-        .symlink        = autofs_root_symlink,
-        .mkdir          = autofs_root_mkdir,
-        .rmdir          = autofs_root_rmdir,
-};
-static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-        struct autofs_dir_ent *ent = NULL;
-        struct autofs_dirhash *dirhash;
-        struct autofs_sb_info *sbi;
-        struct inode * inode = filp->f_path.dentry->d_inode;
-        off_t onr, nr;
-        lock_kernel();
-        sbi = autofs_sbi(inode->i_sb);
-        dirhash = &sbi->dirhash;
-        nr = filp->f_pos;
-        switch(nr)
-        {
-        case 0:
-                if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = ++nr;
-                /* fall through */
-        case 1:
-                if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = ++nr;
-                /* fall through */
-        default:
-                while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
-                        if (!ent->dentry || d_mountpoint(ent->dentry)) {
-                                if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
-                                        goto out;
-                                filp->f_pos = nr;
-                        }
-                }
-                break;
-        }
-out:
-        unlock_kernel();
-        return 0;
-}
-static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, struct autofs_sb_info *sbi)
-{
-        struct inode * inode;
-        struct autofs_dir_ent *ent;
-        int status = 0;
-        if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
-                do {
-                        if (status && dentry->d_inode) {
-                                if (status != -ENOENT)
-                                        printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
-                                return 0; /* Try to get the kernel to invalidate this dentry */
-                        }
-                        /* Turn this into a real negative dentry? */
-                        if (status == -ENOENT) {
-                                dentry->d_time = jiffies + AUTOFS_NEGATIVE_TIMEOUT;
-                                dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-                                return 1;
-                        } else if (status) {
-                                /* Return a negative dentry, but leave it "pending" */
-                                return 1;
-                        }
-                        status = autofs_wait(sbi, &dentry->d_name);
-                } while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
-        }
-        /* Abuse this field as a pointer to the directory entry, used to
-           find the expire list pointers */
-        dentry->d_time = (unsigned long) ent;
-        
-        if (!dentry->d_inode) {
-                inode = autofs_iget(sb, ent->ino);
-                if (IS_ERR(inode)) {
-                        /* Failed, but leave pending for next time */
-                        return 1;
-                }
-                dentry->d_inode = inode;
-        }
-        /* If this is a directory that isn't a mount point, bitch at the
-           daemon and fix it in user space */
-        if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-                return !autofs_wait(sbi, &dentry->d_name);
-        }
-        /* We don't update the usages for the autofs daemon itself, this
-           is necessary for recursive autofs mounts */
-        if (!autofs_oz_mode(sbi)) {
-                autofs_update_usage(&sbi->dirhash,ent);
-        }
-        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-        return 1;
-}
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
-{
-        struct inode * dir;
-        struct autofs_sb_info *sbi;
-        struct autofs_dir_ent *ent;
-        int res;
-        lock_kernel();
-        dir = dentry->d_parent->d_inode;
-        sbi = autofs_sbi(dir->i_sb);
-        /* Pending dentry */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-                if (autofs_oz_mode(sbi))
-                        res = 1;
-                else
-                        res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-                unlock_kernel();
-                return res;
-        }
-        /* Negative dentry.. invalidate if "old" */
-        if (!dentry->d_inode) {
-                unlock_kernel();
-                return (dentry->d_time - jiffies <= AUTOFS_NEGATIVE_TIMEOUT);
-        }
-                
-        /* Check for a non-mountpoint directory */
-        if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
-                if (autofs_oz_mode(sbi))
-                        res = 1;
-                else
-                        res = try_to_fill_dentry(dentry, dir->i_sb, sbi);
-                unlock_kernel();
-                return res;
-        }
-        /* Update the usage list */
-        if (!autofs_oz_mode(sbi)) {
-                ent = (struct autofs_dir_ent *) dentry->d_time;
-                if (ent)
-                        autofs_update_usage(&sbi->dirhash,ent);
-        }
-        unlock_kernel();
-        return 1;
-}
-static const struct dentry_operations autofs_dentry_operations = {
-        .d_revalidate   = autofs_revalidate,
-};
-static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct autofs_sb_info *sbi;
-        int oz_mode;
-        DPRINTK(("autofs_root_lookup: name = "));
-        lock_kernel();
-        autofs_say(dentry->d_name.name,dentry->d_name.len);
-        if (dentry->d_name.len > NAME_MAX) {
-                unlock_kernel();
-                return ERR_PTR(-ENAMETOOLONG);/* File name too long to exist */
-        }
-        sbi = autofs_sbi(dir->i_sb);
-        oz_mode = autofs_oz_mode(sbi);
-        DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
-                                "oz_mode = %d\n", task_pid_nr(current),
-                                task_pgrp_nr(current), sbi->catatonic,
-                                oz_mode));
-        /*
-         * Mark the dentry incomplete, but add it. This is needed so
-         * that the VFS layer knows about the dentry, and we can count
-         * on catching any lookups through the revalidate.
-         *
-         * Let all the hard work be done by the revalidate function that
-         * needs to be able to do this anyway..
-         *
-         * We need to do this before we release the directory semaphore.
-         */
-        dentry->d_op = &autofs_dentry_operations;
-        dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-        d_add(dentry, NULL);
-        mutex_unlock(&dir->i_mutex);
-        autofs_revalidate(dentry, nd);
-        mutex_lock(&dir->i_mutex);
-        /*
-         * If we are still pending, check if we had to handle
-         * a signal. If so we can force a restart..
-         */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
-                /* See if we were interrupted */
-                if (signal_pending(current)) {
-                        sigset_t *sigset = &current->pending.signal;
-                        if (sigismember (sigset, SIGKILL) ||
-                            sigismember (sigset, SIGQUIT) ||
-                            sigismember (sigset, SIGINT)) {
-                                unlock_kernel();
-                                return ERR_PTR(-ERESTARTNOINTR);
-                        }
-                }
-        }
-        unlock_kernel();
-        /*
-         * If this dentry is unhashed, then we shouldn't honour this
-         * lookup even if the dentry is positive.  Returning ENOENT here
-         * doesn't do the right thing for all system calls, but it should
-         * be OK for the operations we permit from an autofs.
-         */
-        if (dentry->d_inode && d_unhashed(dentry))
-                return ERR_PTR(-ENOENT);
-        return NULL;
-}
-static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned int n;
-        int slsize;
-        struct autofs_symlink *sl;
-        struct inode *inode;
-        DPRINTK(("autofs_root_symlink: %s <- ", symname));
-        autofs_say(dentry->d_name.name,dentry->d_name.len);
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        if (autofs_hash_lookup(dh, &dentry->d_name)) {
-                unlock_kernel();
-                return -EEXIST;
-        }
-        n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-        if (n >= AUTOFS_MAX_SYMLINKS) {
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        set_bit(n,sbi->symlink_bitmap);
-        sl = &sbi->symlink[n];
-        sl->len = strlen(symname);
-        sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-        if (!sl->data) {
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-        if (!ent) {
-                kfree(sl->data);
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-        if (!ent->name) {
-                kfree(sl->data);
-                kfree(ent);
-                clear_bit(n,sbi->symlink_bitmap);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        memcpy(sl->data,symname,slsize);
-        sl->mtime = get_seconds();
-        ent->ino = AUTOFS_FIRST_SYMLINK + n;
-        ent->hash = dentry->d_name.hash;
-        memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-        ent->dentry = NULL;     /* We don't keep the dentry for symlinks */
-        autofs_hash_insert(dh,ent);
-        inode = autofs_iget(dir->i_sb, ent->ino);
-        if (IS_ERR(inode))
-                return PTR_ERR(inode);
-        d_instantiate(dentry, inode);
-        unlock_kernel();
-        return 0;
-}
-/*
- * NOTE!
- *
- * Normal filesystems would do a "d_delete()" to tell the VFS dcache
- * that the file no longer exists. However, doing that means that the
- * VFS layer can turn the dentry into a negative dentry, which we
- * obviously do not want (we're dropping the entry not because it
- * doesn't exist, but because it has timed out).
- *
- * Also see autofs_root_rmdir()..
- */
-static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        unsigned int n;
-        /* This allows root to remove symlinks */
-        lock_kernel();
-        if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOENT;
-        }
-        n = ent->ino - AUTOFS_FIRST_SYMLINK;
-        if (n >= AUTOFS_MAX_SYMLINKS) {
-                unlock_kernel();
-                return -EISDIR; /* It's a directory, dummy */
-        }
-        if (!test_bit(n,sbi->symlink_bitmap)) {
-                unlock_kernel();
-                return -EINVAL; /* Nonexistent symlink?  Shouldn't happen */
-        }
-        
-        dentry->d_time = (unsigned long)(struct autofs_dirhash *)NULL;
-        autofs_hash_delete(ent);
-        clear_bit(n,sbi->symlink_bitmap);
-        kfree(sbi->symlink[n].data);
-        d_drop(dentry);
-        
-        unlock_kernel();
-        return 0;
-}
-static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOENT;
-        }
-        if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
-                unlock_kernel();
-                return -ENOTDIR; /* Not a directory */
-        }
-        if (ent->dentry != dentry) {
-                printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
-        }
-        dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
-        autofs_hash_delete(ent);
-        drop_nlink(dir);
-        d_drop(dentry);
-        unlock_kernel();
-        return 0;
-}
-static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
-        struct autofs_dirhash *dh = &sbi->dirhash;
-        struct autofs_dir_ent *ent;
-        struct inode *inode;
-        ino_t ino;
-        lock_kernel();
-        if (!autofs_oz_mode(sbi)) {
-                unlock_kernel();
-                return -EACCES;
-        }
-        ent = autofs_hash_lookup(dh, &dentry->d_name);
-        if (ent) {
-                unlock_kernel();
-                return -EEXIST;
-        }
-        if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
-                printk("autofs: Out of inode numbers -- what the heck did you do??\n");
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ino = sbi->next_dir_ino++;
-        ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-        if (!ent) {
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-        if (!ent->name) {
-                kfree(ent);
-                unlock_kernel();
-                return -ENOSPC;
-        }
-        ent->hash = dentry->d_name.hash;
-        memcpy(ent->name, dentry->d_name.name, 1+(ent->len = dentry->d_name.len));
-        ent->ino = ino;
-        ent->dentry = dentry;
-        autofs_hash_insert(dh,ent);
-        inc_nlink(dir);
-        inode = autofs_iget(dir->i_sb, ino);
-        if (IS_ERR(inode)) {
-                drop_nlink(dir);
-                return PTR_ERR(inode);
-        }
-        d_instantiate(dentry, inode);
-        unlock_kernel();
-        return 0;
-}
-/* Get/set timeout ioctl() operation */
-#ifdef CONFIG_COMPAT
-static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
-                                         unsigned int __user *p)
-{
-        unsigned long ntimeout;
-        if (get_user(ntimeout, p) ||
-            put_user(sbi->exp_timeout / HZ, p))
-                return -EFAULT;
-        if (ntimeout > UINT_MAX/HZ)
-                sbi->exp_timeout = 0;
-        else
-                sbi->exp_timeout = ntimeout * HZ;
-        return 0;
-}
-#endif
-static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
-                                         unsigned long __user *p)
-{
-        unsigned long ntimeout;
-        if (get_user(ntimeout, p) ||
-            put_user(sbi->exp_timeout / HZ, p))
-                return -EFAULT;
-        if (ntimeout > ULONG_MAX/HZ)
-                sbi->exp_timeout = 0;
-        else
-                sbi->exp_timeout = ntimeout * HZ;
-        return 0;
-}
-/* Return protocol version */
-static inline int autofs_get_protover(int __user *p)
-{
-        return put_user(AUTOFS_PROTO_VERSION, p);
-}
-/* Perform an expiry operation */
-static inline int autofs_expire_run(struct super_block *sb,
-                                    struct autofs_sb_info *sbi,
-                                    struct vfsmount *mnt,
-                                    struct autofs_packet_expire __user *pkt_p)
-{
-        struct autofs_dir_ent *ent;
-        struct autofs_packet_expire pkt;
-        memset(&pkt,0,sizeof pkt);
-        pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-        pkt.hdr.type = autofs_ptype_expire;
-        if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
-                return -EAGAIN;
-        pkt.len = ent->len;
-        memcpy(pkt.name, ent->name, pkt.len);
-        pkt.name[pkt.len] = '\0';
-        if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
-                return -EFAULT;
-        return 0;
-}
-/*
- * ioctl()'s on the root directory is the chief method for the daemon to
- * generate kernel reactions
- */
-static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
-        void __user *argp = (void __user *)arg;
-        DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,task_pgrp_nr(current)));
-        if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-             _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
-                return -ENOTTY;
-        
-        if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-                return -EPERM;
-        
-        switch(cmd) {
-        case AUTOFS_IOC_READY:  /* Wait queue: go ahead and retry */
-                return autofs_wait_release(sbi,(autofs_wqt_t)arg,0);
-        case AUTOFS_IOC_FAIL:   /* Wait queue: fail with ENOENT */
-                return autofs_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
-        case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
-                autofs_catatonic_mode(sbi);
-                return 0;
-        case AUTOFS_IOC_PROTOVER: /* Get protocol version */
-                return autofs_get_protover(argp);
-#ifdef CONFIG_COMPAT
-        case AUTOFS_IOC_SETTIMEOUT32:
-                return autofs_compat_get_set_timeout(sbi, argp);
-#endif
-        case AUTOFS_IOC_SETTIMEOUT:
-                return autofs_get_set_timeout(sbi, argp);
-        case AUTOFS_IOC_EXPIRE:
-                return autofs_expire_run(inode->i_sb, sbi, filp->f_path.mnt,
-                                         argp);
-        default:
-                return -ENOSYS;
-        }
-}
-static long autofs_root_ioctl(struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        int ret;
-        lock_kernel();
-        ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
-                                   filp, cmd, arg);
-        unlock_kernel();
-        return ret;
-}
-#ifdef CONFIG_COMPAT
-static long autofs_root_compat_ioctl(struct file *filp,
-                             unsigned int cmd, unsigned long arg)
-{
-        struct inode *inode = filp->f_path.dentry->d_inode;
-        int ret;
-        lock_kernel();
-        if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
-                ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
-        else
-                ret = autofs_do_root_ioctl(inode, filp, cmd,
-                        (unsigned long)compat_ptr(arg));
-        unlock_kernel();
-        return ret;
-}
-#endif
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
deleted file mode 100644
index 7ce9cb2c9ce2..000000000000
--- a/fs/autofs/symlink.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include "autofs_i.h"
-/* Nothing to release.. */
-static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
-        nd_set_link(nd, s);
-        return NULL;
-}
-const struct inode_operations autofs_symlink_inode_operations = {
-        .readlink       = generic_readlink,
-        .follow_link    = autofs_follow_link
-};
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
deleted file mode 100644
index be46805972f0..000000000000
--- a/fs/autofs/waitq.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- linux-c -*- --------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
-#include <linux/file.h>
-#include "autofs_i.h"
-/* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
-static autofs_wqt_t autofs_next_wait_queue = 1;
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS   (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-void autofs_catatonic_mode(struct autofs_sb_info *sbi)
-{
-        struct autofs_wait_queue *wq, *nwq;
-        DPRINTK(("autofs: entering catatonic mode\n"));
-        sbi->catatonic = 1;
-        wq = sbi->queues;
-        sbi->queues = NULL;     /* Erase all wait queues */
-        while ( wq ) {
-                nwq = wq->next;
-                wq->status = -ENOENT; /* Magic is gone - report failure */
-                kfree(wq->name);
-                wq->name = NULL;
-                wake_up(&wq->queue);
-                wq = nwq;
-        }
-        fput(sbi->pipe);        /* Close the pipe */
-        sbi->pipe = NULL;
-        autofs_hash_dputall(&sbi->dirhash); /* Remove all dentry pointers */
-}
-static int autofs_write(struct file *file, const void *addr, int bytes)
-{
-        unsigned long sigpipe, flags;
-        mm_segment_t fs;
-        const char *data = (const char *)addr;
-        ssize_t wr = 0;
-        /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-        sigpipe = sigismember(&current->pending.signal, SIGPIPE);
-        /* Save pointer to user space and point back to kernel space */
-        fs = get_fs();
-        set_fs(KERNEL_DS);
-        while (bytes &&
-               (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
-                data += wr;
-                bytes -= wr;
-        }
-        set_fs(fs);
-        /* Keep the currently executing process from receiving a
-           SIGPIPE unless it was already supposed to get one */
-        if (wr == -EPIPE && !sigpipe) {
-                spin_lock_irqsave(&current->sighand->siglock, flags);
-                sigdelset(&current->pending.signal, SIGPIPE);
-                recalc_sigpending();
-                spin_unlock_irqrestore(&current->sighand->siglock, flags);
-        }
-        return (bytes > 0);
-}
-        
-static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq)
-{
-        struct autofs_packet_missing pkt;
-        DPRINTK(("autofs_wait: wait id = 0x%08lx, name = ", wq->wait_queue_token));
-        autofs_say(wq->name,wq->len);
-        memset(&pkt,0,sizeof pkt); /* For security reasons */
-        pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
-        pkt.hdr.type = autofs_ptype_missing;
-        pkt.wait_queue_token = wq->wait_queue_token;
-        pkt.len = wq->len;
-        memcpy(pkt.name, wq->name, pkt.len);
-        pkt.name[pkt.len] = '\0';
-        if ( autofs_write(sbi->pipe,&pkt,sizeof(struct autofs_packet_missing)) )
-                autofs_catatonic_mode(sbi);
-}
-int autofs_wait(struct autofs_sb_info *sbi, struct qstr *name)
-{
-        struct autofs_wait_queue *wq;
-        int status;
-        /* In catatonic mode, we don't wait for nobody */
-        if ( sbi->catatonic )
-                return -ENOENT;
-        
-        /* We shouldn't be able to get here, but just in case */
-        if ( name->len > NAME_MAX )
-                return -ENOENT;
-        for ( wq = sbi->queues ; wq ; wq = wq->next ) {
-                if ( wq->hash == name->hash &&
-                     wq->len == name->len &&
-                     wq->name && !memcmp(wq->name,name->name,name->len) )
-                        break;
-        }
-        
-        if ( !wq ) {
-                /* Create a new wait queue */
-                wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
-                if ( !wq )
-                        return -ENOMEM;
-                wq->name = kmalloc(name->len,GFP_KERNEL);
-                if ( !wq->name ) {
-                        kfree(wq);
-                        return -ENOMEM;
-                }
-                wq->wait_queue_token = autofs_next_wait_queue++;
-                init_waitqueue_head(&wq->queue);
-                wq->hash = name->hash;
-                wq->len = name->len;
-                wq->status = -EINTR; /* Status return if interrupted */
-                memcpy(wq->name, name->name, name->len);
-                wq->next = sbi->queues;
-                sbi->queues = wq;
-                /* autofs_notify_daemon() may block */
-                wq->wait_ctr = 2;
-                autofs_notify_daemon(sbi,wq);
-        } else
-                wq->wait_ctr++;
-        /* wq->name is NULL if and only if the lock is already released */
-        if ( sbi->catatonic ) {
-                /* We might have slept, so check again for catatonic mode */
-                wq->status = -ENOENT;
-                kfree(wq->name);
-                wq->name = NULL;
-        }
-        if ( wq->name ) {
-                /* Block all but "shutdown" signals while waiting */
-                sigset_t sigmask;
-                siginitsetinv(&sigmask, SHUTDOWN_SIGS);
-                sigprocmask(SIG_BLOCK, &sigmask, &sigmask);
-                interruptible_sleep_on(&wq->queue);
-                sigprocmask(SIG_SETMASK, &sigmask, NULL);
-        } else {
-                DPRINTK(("autofs_wait: skipped sleeping\n"));
-        }
-        status = wq->status;
-        if ( ! --wq->wait_ctr ) /* Are we the last process to need status? */
-                kfree(wq);
-        return status;
-}
-int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
-{
-        struct autofs_wait_queue *wq, **wql;
-        for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-                if ( wq->wait_queue_token == wait_queue_token )
-                        break;
-        }
-        if ( !wq )
-                return -EINVAL;
-        *wql = wq->next;        /* Unlink from chain */
-        kfree(wq->name);
-        wq->name = NULL;        /* Do not wait on this queue */
-        wq->status = status;
-        if ( ! --wq->wait_ctr ) /* Is anyone still waiting for this guy? */
-                kfree(wq);
-        else
-                wake_up(&wq->queue);
-        return 0;
-}
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ba4a38b9c22f..eff9a419469a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -724,6 +724,7 @@ static const struct file_operations _dev_ioctl_fops = {
        .unlocked_ioctl  = autofs_dev_ioctl,
        .compat_ioctl = autofs_dev_ioctl_compat,
        .owner   = THIS_MODULE,
+        .llseek = noop_llseek,
 };
 static struct miscdevice _autofs_dev_ioctl_misc = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4bd8957..c038727b4050 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,16 +14,16 @@
 #include <linux/init.h>
 #include "autofs_i.h"
-static int autofs_get_sb(struct file_system_type *fs_type,
+static struct dentry *autofs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, autofs4_fill_super);
 }
 static struct file_system_type autofs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "autofs",
-        .get_sb         = autofs_get_sb,
+        .mount          = autofs_mount,
        .kill_sb        = autofs4_kill_sb,
 };
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b955dac..ac87e49fa706 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
                inode->i_gid = sb->s_root->d_inode->i_gid;
        }
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        inode->i_ino = get_next_ino();
        if (S_ISDIR(inf->mode)) {
                inode->i_nlink = 2;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cb1bd38dc08c..d5c1401f0031 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,7 +19,7 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include "autofs_i.h"
@@ -28,7 +28,9 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+#ifdef CONFIG_COMPAT
 static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+#endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -978,15 +980,17 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
        }
 }
+static DEFINE_MUTEX(autofs4_ioctl_mutex);
 static long autofs4_root_ioctl(struct file *filp,
                               unsigned int cmd, unsigned long arg)
 {
        long ret;
        struct inode *inode = filp->f_dentry->d_inode;
-        lock_kernel();
+        mutex_lock(&autofs4_ioctl_mutex);
        ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
-        unlock_kernel();
+        mutex_unlock(&autofs4_ioctl_mutex);
        return ret;
 }
@@ -998,13 +1002,13 @@ static long autofs4_root_compat_ioctl(struct file *filp,
        struct inode *inode = filp->f_path.dentry->d_inode;
        int ret;
-        lock_kernel();
+        mutex_lock(&autofs4_ioctl_mutex);
        if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
        else
                ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
                        (unsigned long)compat_ptr(arg));
-        unlock_kernel();
+        mutex_unlock(&autofs4_ioctl_mutex);
        return ret;
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index dc39d2824885..aa4e7c7ae3c6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static int
+static struct dentry *
-befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
+befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
-            void *data, struct vfsmount *mnt)
+            void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
-                           mnt);
 }
 static struct file_system_type befs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "befs",
-        .get_sb         = befs_get_sb,
+        .mount          = befs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,      
 };
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e052b779..685ecff3ab31 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(new, inode);
        mutex_unlock(&info->bfs_lock);
        return 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c4daf0f5fc02..76db6d7d49bb 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -215,14 +214,10 @@ static void bfs_put_super(struct super_block *s)
        if (!info)
                return;
-        lock_kernel();
        mutex_destroy(&info->bfs_lock);
        kfree(info->si_imap);
        kfree(info);
        s->s_fs_info = NULL;
-        unlock_kernel();
 }
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -455,16 +450,16 @@ out:
        return ret;
 }
-static int bfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *bfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
 }
 static struct file_system_type bfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "bfs",
-        .get_sb         = bfs_get_sb,
+        .mount          = bfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd0cc0bf9a40..1befe2ec8186 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                        current_fs_time(inode->i_sb);
@@ -576,6 +577,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 static const struct file_operations bm_entry_operations = {
        .read           = bm_entry_read,
        .write          = bm_entry_write,
+        .llseek         = default_llseek,
 };
 /* /register */
@@ -643,6 +645,7 @@ out:
 static const struct file_operations bm_register_operations = {
        .write          = bm_register_write,
+        .llseek         = noop_llseek,
 };
 /* /status */
@@ -680,6 +683,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 static const struct file_operations bm_status_operations = {
        .read           = bm_status_read,
        .write          = bm_status_write,
+        .llseek         = default_llseek,
 };
 /* Superblock handling */
@@ -702,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
        return err;
 }
-static int bm_get_sb(struct file_system_type *fs_type,
+static struct dentry *bm_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
+        return mount_single(fs_type, flags, data, bm_fill_super);
 }
 static struct linux_binfmt misc_format = {
@@ -716,7 +720,7 @@ static struct linux_binfmt misc_format = {
 static struct file_system_type bm_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "binfmt_misc",
-        .get_sb         = bm_get_sb,
+        .mount          = bm_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 {
        struct bio *bio;
+        if (nr_iovecs > UIO_MAXIOV)
+                return NULL;
        bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
                      gfp_mask);
        if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
                                               gfp_t gfp_mask)
 {
-        struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
+        struct bio_map_data *bmd;
+        if (iov_count > UIO_MAXIOV)
+                return NULL;
+        bmd = kmalloc(sizeof(*bmd), gfp_mask);
        if (!bmd)
                return NULL;
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
                end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                start = uaddr >> PAGE_SHIFT;
+                /*
+                 * Overflow, abort
+                 */
+                if (end < start)
+                        return ERR_PTR(-EINVAL);
                nr_pages += end - start;
                len += iov[i].iov_len;
        }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;
+                /*
+                 * Overflow, abort
+                 */
+                if (end < start)
+                        return ERR_PTR(-EINVAL);
                nr_pages += end - start;
                /*
                 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
                unsigned long start = uaddr >> PAGE_SHIFT;
                const int local_nr_pages = end - start;
                const int page_limit = cur_page + local_nr_pages;
-                
                ret = get_user_pages_fast(uaddr, local_nr_pages,
                                write_to_vm, &pages[cur_page]);
                if (ret < local_nr_pages) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..06e8ff12b97c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@ inline struct block_device *I_BDEV(struct inode *inode)
 EXPORT_SYMBOL(I_BDEV);
+/*
+ * move the inode from its current bdi to a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+                        struct backing_dev_info *dst)
+{
+        spin_lock(&inode_lock);
+        inode->i_data.backing_dev_info = dst;
+        if (inode->i_state & I_DIRTY)
+                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+        spin_unlock(&inode_lock);
+}
 static sector_t max_block(struct block_device *bdev)
 {
        sector_t retval = ~((sector_t)0);
@@ -370,7 +385,7 @@ int blkdev_fsync(struct file *filp, int datasync)
         */
        mutex_unlock(&bd_inode->i_mutex);
-        error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+        error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
        if (error == -EOPNOTSUPP)
                error = 0;
@@ -449,15 +464,15 @@ static const struct super_operations bdev_sops = {
        .evict_inode = bdev_evict_inode,
 };
-static int bd_get_sb(struct file_system_type *fs_type,
+static struct dentry *bd_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
+        return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
 }
 static struct file_system_type bd_type = {
        .name           = "bdev",
-        .get_sb         = bd_get_sb,
+        .mount          = bd_mount,
        .kill_sb        = kill_anon_super,
 };
@@ -550,7 +565,7 @@ EXPORT_SYMBOL(bdget);
 */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-        atomic_inc(&bdev->bd_inode->i_count);
+        ihold(bdev->bd_inode);
        return bdev;
 }
@@ -580,7 +595,7 @@ static struct block_device *bd_acquire(struct inode *inode)
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
        if (bdev) {
-                atomic_inc(&bdev->bd_inode->i_count);
+                ihold(bdev->bd_inode);
                spin_unlock(&bdev_lock);
                return bdev;
        }
@@ -591,12 +606,12 @@ static struct block_device *bd_acquire(struct inode *inode)
                spin_lock(&bdev_lock);
                if (!inode->i_bdev) {
                        /*
-                         * We take an additional bd_inode->i_count for inode,
+                         * We take an additional reference to bd_inode,
                         * and it's released in clear_inode() of inode.
                         * So, we can access it via ->i_mapping always
                         * without igrab().
                         */
-                        atomic_inc(&bdev->bd_inode->i_count);
+                        ihold(bdev->bd_inode);
                        inode->i_bdev = bdev;
                        inode->i_mapping = bdev->bd_inode->i_mapping;
                        list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                bdi = blk_get_backing_dev_info(bdev);
                                if (bdi == NULL)
                                        bdi = &default_backing_dev_info;
-                                bdev->bd_inode->i_data.backing_dev_info = bdi;
+                                bdev_inode_switch_bdi(bdev->bd_inode, bdi);
                        }
                        if (bdev->bd_invalidated)
                                rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        if (ret)
                                goto out_clear;
                        bdev->bd_contains = whole;
-                        bdev->bd_inode->i_data.backing_dev_info =
+                        bdev_inode_switch_bdi(bdev->bd_inode,
-                           whole->bd_inode->i_data.backing_dev_info;
+                                whole->bd_inode->i_data.backing_dev_info);
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
-        bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+        bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                disk_put_part(bdev->bd_part);
                bdev->bd_part = NULL;
                bdev->bd_disk = NULL;
-                bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+                bdev_inode_switch_bdi(bdev->bd_inode,
+                                        &default_backing_dev_info);
                if (bdev != bdev->bd_contains)
                        victim = bdev->bd_contains;
                bdev->bd_contains = NULL;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a2..7845d1f7d1d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
 */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-        struct extent_io_tree *tree;
        struct compressed_bio *cb = bio->bi_private;
        struct inode *inode;
        struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
        /* ok, we're the last bio for this extent, lets start
         * the decompression.
         */
-        tree = &BTRFS_I(inode)->io_tree;
        ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
                                        cb->start,
                                        cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc2..9ac171599258 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
        struct extent_buffer *cow;
-        u32 nritems;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
        level = btrfs_header_level(buf);
-        nritems = btrfs_header_nritems(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-        int err_on_enospc = 0;
        u64 orig_ptr;
        if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;
-        if (btrfs_header_nritems(mid) < 2)
+        btrfs_header_nritems(mid);
-                err_on_enospc = 1;
        left = read_node_slot(root, parent, pslot - 1);
        if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                wret = push_node_left(trans, root, left, mid, 1);
                if (wret < 0)
                        ret = wret;
-                if (btrfs_header_nritems(mid) < 2)
+                btrfs_header_nritems(mid);
-                        err_on_enospc = 1;
        }
        /*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-        u64 orig_ptr;
        if (level == 0)
                return 1;
        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);
-        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
        if (level < BTRFS_MAX_LEVEL - 1)
                parent = path->nodes[level + 1];
@@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        blocksize = btrfs_level_size(root, level - 1);
        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-        if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+        if (tmp) {
-                /*
+                if (btrfs_buffer_uptodate(tmp, 0)) {
-                 * we found an up to date block without sleeping, return
+                        if (btrfs_buffer_uptodate(tmp, gen)) {
-                 * right away
+                                /*
-                 */
+                                 * we found an up to date block without
-                *eb_ret = tmp;
+                                 * sleeping, return
-                return 0;
+                                 * right away
+                                 */
+                                *eb_ret = tmp;
+                                return 0;
+                        }
+                        /* the pages were up to date, but we failed
+                         * the generation number check.  Do a full
+                         * read for the generation number that is correct.
+                         * We must do this without dropping locks so
+                         * we can trust our generation number
+                         */
+                        free_extent_buffer(tmp);
+                        tmp = read_tree_block(root, blocknr, blocksize, gen);
+                        if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                                *eb_ret = tmp;
+                                return 0;
+                        }
+                        free_extent_buffer(tmp);
+                        btrfs_release_path(NULL, p);
+                        return -EIO;
+                }
        }
        /*
@@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);
-        if (tmp)
+        free_extent_buffer(tmp);
-                free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);
@@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
-        int slot;
        int i;
        int push_space = 0;
        int push_items = 0;
@@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        u32 this_item_size;
        u32 old_left_item_size;
-        slot = path->slots[1];
        if (empty)
                nr = min(right_nritems, max_slot);
        else
@@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        int slot;
-        int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        unsigned int size_diff;
        int i;
-        slot_orig = path->slots[0];
        leaf = path->nodes[0];
        slot = path->slots[0];
@@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        int slot;
-        int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        unsigned int old_size;
        int i;
-        slot_orig = path->slots[0];
        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
@@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                            struct btrfs_key *cpu_key, u32 *data_size,
                            int nr)
 {
-        struct extent_buffer *leaf;
        int ret = 0;
        int slot;
        int i;
@@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
        if (ret < 0)
                goto out;
-        leaf = path->nodes[0];
        slot = path->slots[0];
        BUG_ON(slot < 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad17..8db9234f6b41 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
 */
 #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@@ -265,6 +268,22 @@ struct btrfs_chunk {
        /* additional stripes go here */
 } __attribute__ ((__packed__));
+#define BTRFS_FREE_SPACE_EXTENT 1
+#define BTRFS_FREE_SPACE_BITMAP 2
+struct btrfs_free_space_entry {
+        __le64 offset;
+        __le64 bytes;
+        u8 type;
+} __attribute__ ((__packed__));
+struct btrfs_free_space_header {
+        struct btrfs_disk_key location;
+        __le64 generation;
+        __le64 num_entries;
+        __le64 num_bitmaps;
+} __attribute__ ((__packed__));
 static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 {
        BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
        char label[BTRFS_LABEL_SIZE];
+        __le64 cache_generation;
        /* future expansion */
-        __le64 reserved[32];
+        __le64 reserved[31];
        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
@@ -375,13 +396,15 @@ struct btrfs_super_block {
 * ones specified below then we will fail to mount
 */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF    (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL   (2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL   (1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS     (1ULL << 2)
 #define BTRFS_FEATURE_COMPAT_SUPP               0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP            0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP             \
+#define BTRFS_FEATURE_INCOMPAT_SUPP                     \
-        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
-         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
+         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
+         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
 /*
 * A leaf is full of items. offset and size tell us where to find
@@ -675,7 +698,8 @@ struct btrfs_block_group_item {
 struct btrfs_space_info {
        u64 flags;
-        u64 total_bytes;        /* total bytes in the space */
+        u64 total_bytes;        /* total bytes in the space,
+                                   this doesn't take mirrors into account */
        u64 bytes_used;         /* total bytes used,
                                   this does't take mirrors into account */
        u64 bytes_pinned;       /* total bytes pinned, will be freed when the
@@ -687,6 +711,8 @@ struct btrfs_space_info {
        u64 bytes_may_use;      /* number of bytes that may be used for
                                   delalloc/allocations */
        u64 disk_used;          /* total bytes used on disk */
+        u64 disk_total;         /* total bytes on disk, takes mirrors into
+                                   account */
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
@@ -750,6 +776,14 @@ enum btrfs_caching_type {
        BTRFS_CACHE_FINISHED    = 2,
 };
+enum btrfs_disk_cache_state {
+        BTRFS_DC_WRITTEN        = 0,
+        BTRFS_DC_ERROR          = 1,
+        BTRFS_DC_CLEAR          = 2,
+        BTRFS_DC_SETUP          = 3,
+        BTRFS_DC_NEED_WRITE     = 4,
+};
 struct btrfs_caching_control {
        struct list_head list;
        struct mutex mutex;
@@ -763,6 +797,7 @@ struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
        struct btrfs_fs_info *fs_info;
+        struct inode *inode;
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
@@ -773,8 +808,11 @@ struct btrfs_block_group_cache {
        int extents_thresh;
        int free_extents;
        int total_bitmaps;
-        int ro;
+        int ro:1;
-        int dirty;
+        int dirty:1;
+        int iref:1;
+        int disk_cache_state;
        /* cache tracking stuff */
        int cached;
@@ -863,6 +901,7 @@ struct btrfs_fs_info {
        struct btrfs_transaction *running_transaction;
        wait_queue_head_t transaction_throttle;
        wait_queue_head_t transaction_wait;
+        wait_queue_head_t transaction_blocked_wait;
        wait_queue_head_t async_submit_wait;
        struct btrfs_super_block super_copy;
@@ -949,6 +988,7 @@ struct btrfs_fs_info {
        struct btrfs_workers endio_meta_workers;
        struct btrfs_workers endio_meta_write_workers;
        struct btrfs_workers endio_write_workers;
+        struct btrfs_workers endio_freespace_worker;
        struct btrfs_workers submit_workers;
        /*
         * fixup workers take dirty pages that didn't properly go through
@@ -1192,6 +1232,9 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOSSD               (1 << 9)
 #define BTRFS_MOUNT_DISCARD             (1 << 10)
 #define BTRFS_MOUNT_FORCE_COMPRESS      (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE         (1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE         (1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
        write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+                   num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+                   num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+                   generation, 64);
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+                                        struct btrfs_free_space_header *h,
+                                        struct btrfs_disk_key *key)
+{
+        read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+                                            struct btrfs_free_space_header *h,
+                                            struct btrfs_disk_key *key)
+{
+        write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
 /* struct btrfs_disk_key */
 BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
                         objectid, 64);
@@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
                         incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
                         csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+                         cache_generation, 64);
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file)
        return file->f_path.dentry;
 }
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+        return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
 /* extent-tree.c */
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
-                                int num_items, int *retries);
+                                int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv,
-                        u64 num_bytes, int *retries);
+                        u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv,
@@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+                                   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+                                    struct btrfs_trans_handle *trans, int mode,
+                                    u64 start, u64 num_bytes, u64 min_size,
+                                    loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 /* ioctl.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..f0cad5ae5be7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
                ret = btrfs_truncate_item(trans, root, path,
                                          item_len - sub_item_len, 1);
        }
-        return 0;
+        return ret;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..fb827d0d7181 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        struct extent_io_tree *tree;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 found_start;
-        int found_level;
        unsigned long len;
        struct extent_buffer *eb;
        int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
                WARN_ON(1);
                goto err;
        }
-        found_level = btrfs_header_level(eb);
        csum_tree_block(root, eb, 0);
 err:
        free_extent_buffer(eb);
@@ -481,9 +478,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
        end_io_wq->work.flags = 0;
        if (bio->bi_rw & REQ_WRITE) {
-                if (end_io_wq->metadata)
+                if (end_io_wq->metadata == 1)
                        btrfs_queue_worker(&fs_info->endio_meta_write_workers,
                                           &end_io_wq->work);
+                else if (end_io_wq->metadata == 2)
+                        btrfs_queue_worker(&fs_info->endio_freespace_worker,
+                                           &end_io_wq->work);
                else
                        btrfs_queue_worker(&fs_info->endio_write_workers,
                                           &end_io_wq->work);
@@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
        }
 }
+/*
+ * For the metadata arg you want
+ *
+ * 0 - if data
+ * 1 - if normal metadata
+ * 2 - if writing to the free space cache area
+ */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        int metadata)
 {
@@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 static void run_one_async_start(struct btrfs_work *work)
 {
-        struct btrfs_fs_info *fs_info;
        struct async_submit_bio *async;
        async = container_of(work, struct  async_submit_bio, work);
-        fs_info = BTRFS_I(async->inode)->root->fs_info;
        async->submit_bio_start(async->inode, async->rw, async->bio,
                               async->mirror_num, async->bio_flags,
                               async->bio_offset);
@@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                                      u32 blocksize, u64 parent_transid)
 {
        struct extent_buffer *buf = NULL;
-        struct inode *btree_inode = root->fs_info->btree_inode;
-        struct extent_io_tree *io_tree;
        int ret;
-        io_tree = &BTRFS_I(btree_inode)->io_tree;
        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
        if (!buf)
                return NULL;
@@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
        u64 start = 0;
        struct page *page;
        struct extent_io_tree *io_tree = NULL;
-        struct btrfs_fs_info *info = NULL;
        struct bio_vec *bvec;
        int i;
        int ret;
@@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
                buf_len = page->private >> 2;
                start = page_offset(page) + bvec->bv_offset;
                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-                info = BTRFS_I(page->mapping->host)->root->fs_info;
        }
        /* are we fully contained in this bio? */
        if (buf_len <= length)
@@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->transaction_throttle);
        init_waitqueue_head(&fs_info->transaction_wait);
+        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
        __setup_root(4096, 4096, 4096, 4096, tree_root,
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
        if (!bh)
                goto fail_iput;
@@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
+        btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+                           1, &fs_info->generic_worker);
        /*
         * endios are largely parallel and should have a very
@@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        btrfs_start_workers(&fs_info->endio_meta_workers, 1);
        btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
        btrfs_start_workers(&fs_info->endio_write_workers, 1);
+        btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (!(sb->s_flags & MS_RDONLY)) {
                down_read(&fs_info->cleanup_work_sem);
                btrfs_orphan_cleanup(fs_info->fs_root);
+                btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
        }
@@ -2035,6 +2038,7 @@ fail_sb_buffer:
        btrfs_stop_workers(&fs_info->endio_meta_workers);
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
+        btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2063,7 +2067,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
-                if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+                if (printk_ratelimit()) {
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
@@ -2200,21 +2204,10 @@ static int write_dev_supers(struct btrfs_device *device,
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
-                if (i == last_barrier && do_barriers && device->barriers) {
+                if (i == last_barrier && do_barriers)
-                        ret = submit_bh(WRITE_BARRIER, bh);
+                        ret = submit_bh(WRITE_FLUSH_FUA, bh);
-                        if (ret == -EOPNOTSUPP) {
+                else
-                                printk("btrfs: disabling barriers on dev %s\n",
-                                       device->name);
-                                set_buffer_uptodate(bh);
-                                device->barriers = 0;
-                                /* one reference for submit_bh */
-                                get_bh(bh);
-                                lock_buffer(bh);
-                                ret = submit_bh(WRITE_SYNC, bh);
-                        }
-                } else {
                        ret = submit_bh(WRITE_SYNC, bh);
-                }
                if (ret)
                        errors++;
@@ -2421,6 +2414,7 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
+        btrfs_put_block_group_cache(fs_info);
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret =  btrfs_commit_super(root);
                if (ret)
@@ -2467,6 +2461,7 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->endio_meta_workers);
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
+        btrfs_stop_workers(&fs_info->endio_freespace_worker);
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_close_devices(fs_info->fs_devices);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..0c097f3aec41 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
                return NULL;
        }
+        /* We're loading it the fast way, so we don't have a caching_ctl. */
+        if (!cache->caching_ctl) {
+                spin_unlock(&cache->lock);
+                return NULL;
+        }
        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
@@ -421,7 +427,9 @@ err:
        return 0;
 }
-static int cache_block_group(struct btrfs_block_group_cache *cache)
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+                             struct btrfs_trans_handle *trans,
+                             int load_cache_only)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
@@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;
+        /*
+         * We can't do the read from on-disk cache during a commit since we need
+         * to have the normal tree locking.
+         */
+        if (!trans->transaction->in_commit) {
+                spin_lock(&cache->lock);
+                if (cache->cached != BTRFS_CACHE_NO) {
+                        spin_unlock(&cache->lock);
+                        return 0;
+                }
+                cache->cached = BTRFS_CACHE_STARTED;
+                spin_unlock(&cache->lock);
+                ret = load_free_space_cache(fs_info, cache);
+                spin_lock(&cache->lock);
+                if (ret == 1) {
+                        cache->cached = BTRFS_CACHE_FINISHED;
+                        cache->last_byte_to_unpin = (u64)-1;
+                } else {
+                        cache->cached = BTRFS_CACHE_NO;
+                }
+                spin_unlock(&cache->lock);
+                if (ret == 1)
+                        return 0;
+        }
+        if (load_cache_only)
+                return 0;
        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);
@@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-                if (found->flags == flags) {
+                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
@@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
        return num;
 }
+static u64 div_factor_fine(u64 num, int factor)
+{
+        if (factor == 100)
+                return num;
+        num *= factor;
+        do_div(num, 100);
+        return num;
+}
 u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
 {
@@ -1695,8 +1742,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
 {
-        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
-                        BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -2688,6 +2734,109 @@ next_block_group(struct btrfs_root *root,
        return cache;
 }
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+                            struct btrfs_trans_handle *trans,
+                            struct btrfs_path *path)
+{
+        struct btrfs_root *root = block_group->fs_info->tree_root;
+        struct inode *inode = NULL;
+        u64 alloc_hint = 0;
+        int num_pages = 0;
+        int retries = 0;
+        int ret = 0;
+        /*
+         * If this block group is smaller than 100 megs don't bother caching the
+         * block group.
+         */
+        if (block_group->key.offset < (100 * 1024 * 1024)) {
+                spin_lock(&block_group->lock);
+                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
+again:
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+                ret = PTR_ERR(inode);
+                btrfs_release_path(root, path);
+                goto out;
+        }
+        if (IS_ERR(inode)) {
+                BUG_ON(retries);
+                retries++;
+                if (block_group->ro)
+                        goto out_free;
+                ret = create_free_space_inode(root, trans, block_group, path);
+                if (ret)
+                        goto out_free;
+                goto again;
+        }
+        /*
+         * We want to set the generation to 0, that way if anything goes wrong
+         * from here on out we know not to trust this cache when we load up next
+         * time.
+         */
+        BTRFS_I(inode)->generation = 0;
+        ret = btrfs_update_inode(trans, root, inode);
+        WARN_ON(ret);
+        if (i_size_read(inode) > 0) {
+                ret = btrfs_truncate_free_space_cache(root, trans, path,
+                                                      inode);
+                if (ret)
+                        goto out_put;
+        }
+        spin_lock(&block_group->lock);
+        if (block_group->cached != BTRFS_CACHE_FINISHED) {
+                spin_unlock(&block_group->lock);
+                goto out_put;
+        }
+        spin_unlock(&block_group->lock);
+        num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+        if (!num_pages)
+                num_pages = 1;
+        /*
+         * Just to make absolutely sure we have enough space, we're going to
+         * preallocate 16 pages worth of space for each gigabyte of the block
+         * group.  In practice we ought to use at most 8, but we need extra
+         * space so we can add our header and have a terminator between the
+         * extents and the bitmaps.
+         */
+        num_pages *= 16;
+        num_pages *= PAGE_CACHE_SIZE;
+        ret = btrfs_check_data_free_space(inode, num_pages);
+        if (ret)
+                goto out_put;
+        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+                                              num_pages, num_pages,
+                                              &alloc_hint);
+        btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+        iput(inode);
+out_free:
+        btrfs_release_path(root, path);
+out:
+        spin_lock(&block_group->lock);
+        if (ret)
+                block_group->disk_cache_state = BTRFS_DC_ERROR;
+        else
+                block_group->disk_cache_state = BTRFS_DC_SETUP;
+        spin_unlock(&block_group->lock);
+        return ret;
+}
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
@@ -2700,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
+again:
+        while (1) {
+                cache = btrfs_lookup_first_block_group(root->fs_info, last);
+                while (cache) {
+                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                                break;
+                        cache = next_block_group(root, cache);
+                }
+                if (!cache) {
+                        if (last == 0)
+                                break;
+                        last = 0;
+                        continue;
+                }
+                err = cache_save_setup(cache, trans, path);
+                last = cache->key.objectid + cache->key.offset;
+                btrfs_put_block_group(cache);
+        }
        while (1) {
                if (last == 0) {
                        err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                while (cache) {
+                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+                                btrfs_put_block_group(cache);
+                                goto again;
+                        }
                        if (cache->dirty)
                                break;
                        cache = next_block_group(root, cache);
@@ -2720,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                        continue;
                }
+                if (cache->disk_cache_state == BTRFS_DC_SETUP)
+                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
                cache->dirty = 0;
                last = cache->key.objectid + cache->key.offset;
@@ -2728,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                btrfs_put_block_group(cache);
        }
+        while (1) {
+                /*
+                 * I don't think this is needed since we're just marking our
+                 * preallocated extent as written, but just in case it can't
+                 * hurt.
+                 */
+                if (last == 0) {
+                        err = btrfs_run_delayed_refs(trans, root,
+                                                     (unsigned long)-1);
+                        BUG_ON(err);
+                }
+                cache = btrfs_lookup_first_block_group(root->fs_info, last);
+                while (cache) {
+                        /*
+                         * Really this shouldn't happen, but it could if we
+                         * couldn't write the entire preallocated extent and
+                         * splitting the extent resulted in a new block.
+                         */
+                        if (cache->dirty) {
+                                btrfs_put_block_group(cache);
+                                goto again;
+                        }
+                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+                                break;
+                        cache = next_block_group(root, cache);
+                }
+                if (!cache) {
+                        if (last == 0)
+                                break;
+                        last = 0;
+                        continue;
+                }
+                btrfs_write_out_cache(root, trans, cache, path);
+                /*
+                 * If we didn't have an error then the cache state is still
+                 * NEED_WRITE, so we can set it to WRITTEN.
+                 */
+                if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
+                last = cache->key.objectid + cache->key.offset;
+                btrfs_put_block_group(cache);
+        }
        btrfs_free_path(path);
        return 0;
 }
@@ -2763,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
+                found->disk_total += total_bytes * factor;
                found->bytes_used += bytes_used;
                found->disk_used += bytes_used * factor;
                found->full = 0;
@@ -2782,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                                BTRFS_BLOCK_GROUP_SYSTEM |
                                BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
+        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
        found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
@@ -2883,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 used;
-        int ret = 0, committed = 0;
+        int ret = 0, committed = 0, alloc_chunk = 1;
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+        if (root == root->fs_info->tree_root) {
+                alloc_chunk = 0;
+                committed = 1;
+        }
        data_sinfo = BTRFS_I(inode)->space_info;
        if (!data_sinfo)
                goto alloc;
@@ -2906,7 +3134,7 @@ again:
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-                if (!data_sinfo->full) {
+                if (!data_sinfo->full && alloc_chunk) {
                        u64 alloc_target;
                        data_sinfo->force_alloc = 1;
@@ -2998,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+static int should_alloc_chunk(struct btrfs_root *root,
-                              u64 alloc_bytes)
+                              struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+        u64 thresh;
        if (sinfo->bytes_used + sinfo->bytes_reserved +
            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3011,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
            alloc_bytes < div_factor(num_bytes, 8))
                return 0;
+        thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+        if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+                return 0;
        return 1;
 }
@@ -3042,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                goto out;
        }
-        if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+        if (!force && !should_alloc_chunk(extent_root, space_info,
+                                          alloc_bytes)) {
                spin_unlock(&space_info->lock);
                goto out;
        }
        spin_unlock(&space_info->lock);
        /*
+         * If we have mixed data/metadata chunks we want to make sure we keep
+         * allocating mixed chunks instead of individual chunks.
+         */
+        if (btrfs_mixed_space_info(space_info))
+                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
@@ -3073,55 +3316,25 @@ out:
        return ret;
 }
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                struct btrfs_space_info *sinfo, u64 num_bytes)
-{
-        int ret;
-        int end_trans = 0;
-        if (sinfo->full)
-                return 0;
-        spin_lock(&sinfo->lock);
-        ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-        spin_unlock(&sinfo->lock);
-        if (!ret)
-                return 0;
-        if (!trans) {
-                trans = btrfs_join_transaction(root, 1);
-                BUG_ON(IS_ERR(trans));
-                end_trans = 1;
-        }
-        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                             num_bytes + 2 * 1024 * 1024,
-                             get_alloc_profile(root, sinfo->flags), 0);
-        if (end_trans)
-                btrfs_end_transaction(trans, root);
-        return ret == 1 ? 1 : 0;
-}
 /*
 * shrink metadata reservation for delalloc
 */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, u64 to_reclaim)
+                           struct btrfs_root *root, u64 to_reclaim, int sync)
 {
        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_space_info *space_info;
        u64 reserved;
        u64 max_reclaim;
        u64 reclaimed = 0;
        int pause = 1;
-        int ret;
+        int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        block_rsv = &root->fs_info->delalloc_block_rsv;
-        spin_lock(&block_rsv->lock);
+        space_info = block_rsv->space_info;
-        reserved = block_rsv->reserved;
-        spin_unlock(&block_rsv->lock);
+        smp_mb();
+        reserved = space_info->bytes_reserved;
        if (reserved == 0)
                return 0;
@@ -3129,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        max_reclaim = min(reserved, to_reclaim);
        while (1) {
-                ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+                /* have the flusher threads jump in and do some IO */
-                if (!ret) {
+                smp_mb();
-                        __set_current_state(TASK_INTERRUPTIBLE);
+                nr_pages = min_t(unsigned long, nr_pages,
-                        schedule_timeout(pause);
+                       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
-                        pause <<= 1;
+                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
-                        if (pause > HZ / 10)
-                                pause = HZ / 10;
-                } else {
-                        pause = 1;
-                }
-                spin_lock(&block_rsv->lock);
+                spin_lock(&space_info->lock);
-                if (reserved > block_rsv->reserved)
+                if (reserved > space_info->bytes_reserved)
-                        reclaimed = reserved - block_rsv->reserved;
+                        reclaimed += reserved - space_info->bytes_reserved;
-                reserved = block_rsv->reserved;
+                reserved = space_info->bytes_reserved;
-                spin_unlock(&block_rsv->lock);
+                spin_unlock(&space_info->lock);
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
+                __set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(pause);
+                pause <<= 1;
+                if (pause > HZ / 10)
+                        pause = HZ / 10;
        }
        return reclaimed >= to_reclaim;
 }
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
+/*
-                                struct btrfs_root *root,
+ * Retries tells us how many times we've called reserve_metadata_bytes.  The
-                                struct btrfs_block_rsv *block_rsv,
+ * idea is if this is the first call (retries == 0) then we will add to our
-                                u64 num_bytes, int *retries)
+ * reserved count if we can't make the allocation in order to hold our place
+ * while we go and try and free up space.  That way for retries > 1 we don't try
+ * and add space, we just check to see if the amount of unused space is >= the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don't intend to retry this reservation, pass flush == 0 so
+ * that we skip the reclaim and retry logic and fail with -ENOSPC right away.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct btrfs_block_rsv *block_rsv,
+                                  u64 orig_bytes, int flush)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
-        int ret;
+        u64 unused;
+        u64 num_bytes = orig_bytes;
+        int retries = 0;
+        int ret = 0;
+        bool reserved = false;
+        bool committed = false;
-        if ((*retries) > 2)
+again:
-                return -ENOSPC;
+        ret = -ENOSPC;
+        if (reserved)
+                num_bytes = 0;
-        ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+        spin_lock(&space_info->lock);
-        if (ret)
+        unused = space_info->bytes_used + space_info->bytes_reserved +
-                return 1;
+                 space_info->bytes_pinned + space_info->bytes_readonly +
+                 space_info->bytes_may_use;
-        if (trans && trans->transaction->in_commit)
+        /*
-                return -ENOSPC;
+         * The idea here is that if we've not already over-reserved the block
+         * group we can go ahead and save our reservation first and then start
+         * flushing if we need to.  Otherwise if we've already overcommitted
+         * lets start flushing stuff first and then come back and try to make
+         * our reservation.
+         */
+        if (unused <= space_info->total_bytes) {
+                unused = space_info->total_bytes - unused; /* free bytes */
+                if (unused >= num_bytes) {
+                        if (!reserved)
+                                space_info->bytes_reserved += orig_bytes;
+                        ret = 0;
+                } else {
+                        /*
+                         * Ok set num_bytes to orig_bytes since we aren't
+                         * overcommitted, this way we only try and reclaim what
+                         * we need.
+                         */
+                        num_bytes = orig_bytes;
+                }
+        } else {
+                /*
+                 * Ok we're over committed, set num_bytes to the overcommitted
+                 * amount plus the amount of bytes that we need for this
+                 * reservation.
+                 */
+                num_bytes = unused - space_info->total_bytes +
+                        (orig_bytes * (retries + 1));
+        }
-        ret = shrink_delalloc(trans, root, num_bytes);
+        /*
-        if (ret)
+         * Couldn't make our reservation, save our place so while we're trying
-                return ret;
+         * to reclaim space we can actually use it instead of somebody else
+         * stealing it from us.
+         */
+        if (ret && !reserved) {
+                space_info->bytes_reserved += orig_bytes;
+                reserved = true;
+        }
-        spin_lock(&space_info->lock);
-        if (space_info->bytes_pinned < num_bytes)
-                ret = 1;
        spin_unlock(&space_info->lock);
-        if (ret)
-                return -ENOSPC;
-        (*retries)++;
-        if (trans)
+        if (!ret)
-                return -EAGAIN;
+                return 0;
-        trans = btrfs_join_transaction(root, 1);
+        if (!flush)
-        BUG_ON(IS_ERR(trans));
+                goto out;
-        ret = btrfs_commit_transaction(trans, root);
-        BUG_ON(ret);
-        return 1;
+        /*
-}
+         * We do synchronous shrinking since we don't actually unreserve
+         * metadata until after the IO is completed.
+         */
+        ret = shrink_delalloc(trans, root, num_bytes, 1);
+        if (ret > 0)
+                return 0;
+        else if (ret < 0)
+                goto out;
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+        /*
-                                  u64 num_bytes)
+         * So if we were overcommitted it's possible that somebody else flushed
-{
+         * out enough space and we simply didn't have enough space to reclaim,
-        struct btrfs_space_info *space_info = block_rsv->space_info;
+         * so go back around and try again.
-        u64 unused;
+         */
-        int ret = -ENOSPC;
+        if (retries < 2) {
+                retries++;
+                goto again;
+        }
        spin_lock(&space_info->lock);
-        unused = space_info->bytes_used + space_info->bytes_reserved +
+        /*
-                 space_info->bytes_pinned + space_info->bytes_readonly;
+         * Not enough space to be reclaimed, don't bother committing the
+         * transaction.
+         */
+        if (space_info->bytes_pinned < orig_bytes)
+                ret = -ENOSPC;
+        spin_unlock(&space_info->lock);
+        if (ret)
+                goto out;
-        if (unused < space_info->total_bytes)
+        ret = -EAGAIN;
-                unused = space_info->total_bytes - unused;
+        if (trans || committed)
-        else
+                goto out;
-                unused = 0;
-        if (unused >= num_bytes) {
+        ret = -ENOSPC;
-                if (block_rsv->priority >= 10) {
+        trans = btrfs_join_transaction(root, 1);
-                        space_info->bytes_reserved += num_bytes;
+        if (IS_ERR(trans))
-                        ret = 0;
+                goto out;
-                } else {
+        ret = btrfs_commit_transaction(trans, root);
-                        if ((unused + block_rsv->reserved) *
+        if (!ret) {
-                            block_rsv->priority >=
+                trans = NULL;
-                            (num_bytes + block_rsv->reserved) * 10) {
+                committed = true;
-                                space_info->bytes_reserved += num_bytes;
+                goto again;
-                                ret = 0;
+        }
-                        }
-                }
+out:
+        if (reserved) {
+                spin_lock(&space_info->lock);
+                space_info->bytes_reserved -= orig_bytes;
+                spin_unlock(&space_info->lock);
        }
-        spin_unlock(&space_info->lock);
        return ret;
 }
@@ -3328,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
-        u64 alloc_target;
        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
        if (!block_rsv)
                return NULL;
        btrfs_init_block_rsv(block_rsv);
-        alloc_target = btrfs_get_alloc_profile(root, 0);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
        return block_rsv;
 }
@@ -3370,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv,
-                        u64 num_bytes, int *retries)
+                        u64 num_bytes)
 {
        int ret;
        if (num_bytes == 0)
                return 0;
-again:
-        ret = reserve_metadata_bytes(block_rsv, num_bytes);
+        ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 1);
                return 0;
        }
-        ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-        if (ret > 0)
-                goto again;
        return ret;
 }
@@ -3421,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                return 0;
        if (block_rsv->refill_used) {
-                ret = reserve_metadata_bytes(block_rsv, num_bytes);
+                ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                             num_bytes, 0);
                if (!ret) {
                        block_rsv_add_bytes(block_rsv, num_bytes, 0);
                        return 0;
@@ -3500,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        spin_lock(&sinfo->lock);
+        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+                data_used = 0;
        meta_used = sinfo->bytes_used;
        spin_unlock(&sinfo->lock);
@@ -3527,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
        block_rsv->size = num_bytes;
        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-                    sinfo->bytes_reserved + sinfo->bytes_readonly;
+                    sinfo->bytes_reserved + sinfo->bytes_readonly +
+                    sinfo->bytes_may_use;
        if (sinfo->total_bytes > num_bytes) {
                num_bytes = sinfo->total_bytes - num_bytes;
@@ -3598,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
-                                 int num_items, int *retries)
+                                 int num_items)
 {
        u64 num_bytes;
        int ret;
@@ -3608,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
        num_bytes = calc_trans_metadata_size(root, num_items);
        ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-                                  num_bytes, retries);
+                                  num_bytes);
        if (!ret) {
                trans->bytes_reserved += num_bytes;
                trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3682,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
        u64 to_reserve;
        int nr_extents;
-        int retries = 0;
        int ret;
        if (btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
        spin_lock(&BTRFS_I(inode)->accounting_lock);
        nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
        if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3699,18 +3972,14 @@ again:
                nr_extents = 0;
                to_reserve = 0;
        }
+        spin_unlock(&BTRFS_I(inode)->accounting_lock);
        to_reserve += calc_csum_metadata_size(inode, num_bytes);
-        ret = reserve_metadata_bytes(block_rsv, to_reserve);
+        ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
-        if (ret) {
+        if (ret)
-                spin_unlock(&BTRFS_I(inode)->accounting_lock);
-                ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-                                           &retries);
-                if (ret > 0)
-                        goto again;
                return ret;
-        }
+        spin_lock(&BTRFS_I(inode)->accounting_lock);
        BTRFS_I(inode)->reserved_extents += nr_extents;
        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3718,7 +3987,7 @@ again:
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
        if (block_rsv->size > 512 * 1024 * 1024)
-                shrink_delalloc(NULL, root, to_reserve);
+                shrink_delalloc(NULL, root, to_reserve, 0);
        return 0;
 }
@@ -3777,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc)
 {
-        struct btrfs_block_group_cache *cache;
+        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_fs_info *info = root->fs_info;
-        int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
+        int factor;
        /* block accounting for super block */
        spin_lock(&info->delalloc_lock);
@@ -3804,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        factor = 2;
                else
                        factor = 1;
+                /*
+                 * If this block group has free space cache written out, we
+                 * need to make sure to load it if we are removing space.  This
+                 * is because we need the unpinning stage to actually add the
+                 * space back to the block group, otherwise we will leak space.
+                 */
+                if (!alloc && cache->cached == BTRFS_CACHE_NO)
+                        cache_block_group(cache, trans, 1);
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
+                if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+                    cache->disk_cache_state < BTRFS_DC_CLEAR)
+                        cache->disk_cache_state = BTRFS_DC_CLEAR;
                cache->dirty = 1;
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -4555,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
+        bool use_cluster = true;
        u64 ideal_cache_percent = 0;
        u64 ideal_cache_offset = 0;
@@ -4569,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                return -ENOSPC;
        }
+        /*
+         * If the space info is for both data and metadata it means we have a
+         * small filesystem and we can't use the clustering stuff.
+         */
+        if (btrfs_mixed_space_info(space_info))
+                use_cluster = false;
        if (orig_root->ref_cows || empty_size)
                allowed_chunk_alloc = 1;
-        if (data & BTRFS_BLOCK_GROUP_METADATA) {
+        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
                        empty_cluster = 64 * 1024;
        }
-        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+        if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+            btrfs_test_opt(root, SSD)) {
                last_ptr = &root->fs_info->data_alloc_cluster;
        }
@@ -4642,6 +4934,10 @@ have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
+                        ret = cache_block_group(block_group, trans, 1);
+                        if (block_group->cached == BTRFS_CACHE_FINISHED)
+                                goto have_block_group;
                        free_percent = btrfs_block_group_used(&block_group->item);
                        free_percent *= 100;
                        free_percent = div64_u64(free_percent,
@@ -4662,7 +4958,7 @@ have_block_group:
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                                ret = cache_block_group(block_group);
+                                ret = cache_block_group(block_group, trans, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@ -5219,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        u64 num_bytes = ins->offset;
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-        cache_block_group(block_group);
+        cache_block_group(block_group, trans, 0);
        caching_ctl = get_caching_control(block_group);
        if (!caching_ctl) {
@@ -5309,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
        if (block_rsv->size == 0) {
-                ret = reserve_metadata_bytes(block_rsv, blocksize);
+                ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                             blocksize, 0);
                if (ret)
                        return ERR_PTR(ret);
                return block_rsv;
@@ -5319,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        if (!ret)
                return block_rsv;
-        WARN_ON(1);
-        printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-                block_rsv->size, block_rsv->reserved,
-                block_rsv->freed[0], block_rsv->freed[1]);
        return ERR_PTR(-ENOSPC);
 }
@@ -5422,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
        u64 generation;
        u64 refs;
        u64 flags;
-        u64 last = 0;
        u32 nritems;
        u32 blocksize;
        struct btrfs_key key;
@@ -5490,7 +5781,6 @@ reada:
                                           generation);
                if (ret)
                        break;
-                last = bytenr + blocksize;
                nread++;
        }
        wc->reada_slot = slot;
@@ -7814,6 +8104,40 @@ out:
        return ret;
 }
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+        struct btrfs_block_group_cache *block_group;
+        u64 last = 0;
+        while (1) {
+                struct inode *inode;
+                block_group = btrfs_lookup_first_block_group(info, last);
+                while (block_group) {
+                        spin_lock(&block_group->lock);
+                        if (block_group->iref)
+                                break;
+                        spin_unlock(&block_group->lock);
+                        block_group = next_block_group(info->tree_root,
+                                                       block_group);
+                }
+                if (!block_group) {
+                        if (last == 0)
+                                break;
+                        last = 0;
+                        continue;
+                }
+                inode = block_group->inode;
+                block_group->iref = 0;
+                block_group->inode = NULL;
+                spin_unlock(&block_group->lock);
+                iput(inode);
+                last = block_group->key.objectid + block_group->key.offset;
+                btrfs_put_block_group(block_group);
+        }
+}
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group_cache *block_group;
@@ -7897,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
+        int need_clear = 0;
+        u64 cache_gen;
        root = info->extent_root;
        key.objectid = 0;
@@ -7906,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        if (!path)
                return -ENOMEM;
+        cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+        if (cache_gen != 0 &&
+            btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+                need_clear = 1;
+        if (btrfs_test_opt(root, CLEAR_CACHE))
+                need_clear = 1;
+        if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+                printk(KERN_INFO "btrfs: disk space caching is enabled\n");
        while (1) {
                ret = find_first_block_group(root, path, &key);
                if (ret > 0)
@@ -7928,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
+                if (need_clear)
+                        cache->disk_cache_state = BTRFS_DC_CLEAR;
                /*
                 * we only want to have 32k of ram per block group for keeping
                 * track of free space, and if we pass 1/2 of that we want to
@@ -8032,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        cache->sectorsize = root->sectorsize;
+        cache->fs_info = root->fs_info;
        /*
         * we only want to have 32k of ram per block group for keeping track
@@ -8088,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
        struct btrfs_free_cluster *cluster;
+        struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_key key;
+        struct inode *inode;
        int ret;
+        int factor;
        root = root->fs_info->extent_root;
@@ -8098,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        BUG_ON(!block_group->ro);
        memcpy(&key, &block_group->key, sizeof(key));
+        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                  BTRFS_BLOCK_GROUP_RAID1 |
+                                  BTRFS_BLOCK_GROUP_RAID10))
+                factor = 2;
+        else
+                factor = 1;
        /* make sure this block group isn't part of an allocation cluster */
        cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (!IS_ERR(inode)) {
+                btrfs_orphan_add(trans, inode);
+                clear_nlink(inode);
+                /* One for the block groups ref */
+                spin_lock(&block_group->lock);
+                if (block_group->iref) {
+                        block_group->iref = 0;
+                        block_group->inode = NULL;
+                        spin_unlock(&block_group->lock);
+                        iput(inode);
+                } else {
+                        spin_unlock(&block_group->lock);
+                }
+                /* One for our lookup ref */
+                iput(inode);
+        }
+        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+        key.offset = block_group->key.objectid;
+        key.type = 0;
+        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+        if (ret < 0)
+                goto out;
+        if (ret > 0)
+                btrfs_release_path(tree_root, path);
+        if (ret == 0) {
+                ret = btrfs_del_item(trans, tree_root, path);
+                if (ret)
+                        goto out;
+                btrfs_release_path(tree_root, path);
+        }
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
+        block_group->space_info->disk_total -= block_group->key.offset * factor;
        spin_unlock(&block_group->space_info->lock);
+        memcpy(&key, &block_group->key, sizeof(key));
        btrfs_clear_space_info_full(root->fs_info);
        btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53a..eac10e3260a9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
                          struct address_space *mapping, gfp_t mask)
 {
        tree->state = RB_ROOT;
-        tree->buffer = RB_ROOT;
+        INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
        return ret;
 }
-static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
-                                          u64 offset, struct rb_node *node)
-{
-        struct rb_root *root = &tree->buffer;
-        struct rb_node **p = &root->rb_node;
-        struct rb_node *parent = NULL;
-        struct extent_buffer *eb;
-        while (*p) {
-                parent = *p;
-                eb = rb_entry(parent, struct extent_buffer, rb_node);
-                if (offset < eb->start)
-                        p = &(*p)->rb_left;
-                else if (offset > eb->start)
-                        p = &(*p)->rb_right;
-                else
-                        return eb;
-        }
-        rb_link_node(node, parent, p);
-        rb_insert_color(node, root);
-        return NULL;
-}
-static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
-                                           u64 offset)
-{
-        struct rb_root *root = &tree->buffer;
-        struct rb_node *n = root->rb_node;
-        struct extent_buffer *eb;
-        while (n) {
-                eb = rb_entry(n, struct extent_buffer, rb_node);
-                if (offset < eb->start)
-                        n = n->rb_left;
-                else if (offset > eb->start)
-                        n = n->rb_right;
-                else
-                        return eb;
-        }
-        return NULL;
-}
 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
                     struct extent_state *other)
 {
@@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
        u64 start;
-        u64 end;
        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
-        end = start + bvec->bv_len - 1;
        bio->bi_private = NULL;
@@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 iosize;
-        u64 unlock_start;
        sector_t sector;
        struct extent_state *cached_state = NULL;
        struct extent_map *em;
@@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (tree->ops && tree->ops->writepage_end_io_hook)
                        tree->ops->writepage_end_io_hook(page, start,
                                                         page_end, NULL, 1);
-                unlock_start = page_end + 1;
                goto done;
        }
@@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         page_end, NULL, 1);
-                        unlock_start = page_end + 1;
                        break;
                }
                em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                        cur += iosize;
                        pg_offset += iosize;
-                        unlock_start = cur;
                        continue;
                }
                /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        int scanned = 0;
-        int range_whole = 0;
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
@@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
-                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                        range_whole = 1;
                scanned = 1;
        }
 retry:
@@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
                                         NULL, 1,
                                         end_bio_extent_preparewrite, 0,
                                         0, 0);
+                        if (ret && !err)
+                                err = ret;
                        iocount++;
                        block_start = block_start + iosize;
                } else {
@@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
        kmem_cache_free(extent_buffer_cache, eb);
 }
+/*
+ * Helper for releasing extent buffer pages.
+ *
+ * Calls page_cache_release() on every page that extent_buffer_page()
+ * returns for @eb, for page indexes num_extent_pages() - 1 down to
+ * @start_idx (inclusive). Nothing is done when @eb has no first_page or
+ * when @start_idx is not below the number of pages.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+                                                unsigned long start_idx)
+{
+        unsigned long i;
+        if (!eb->first_page)
+                return;
+        /* i is one past the page handled, so the test is safe for start_idx 0 */
+        for (i = num_extent_pages(eb->start, eb->len); i > start_idx; i--) {
+                struct page *p = extent_buffer_page(eb, i - 1);
+                if (p)
+                        page_cache_release(p);
+        }
+}
+/*
+ * Helper for releasing the extent buffer: releases the pages of @eb
+ * starting from page index 0 and then hands @eb itself to
+ * __free_extent_buffer().
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+        btrfs_release_extent_buffer_page(eb, 0);
+        __free_extent_buffer(eb);
+}
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len,
                                          struct page *page0,
@@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        struct page *p;
        struct address_space *mapping = tree->mapping;
        int uptodate = 1;
+        int ret;
-        spin_lock(&tree->buffer_lock);
+        rcu_read_lock();
-        eb = buffer_search(tree, start);
+        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        if (eb) {
+        if (eb && atomic_inc_not_zero(&eb->refs)) {
-                atomic_inc(&eb->refs);
+                rcu_read_unlock();
-                spin_unlock(&tree->buffer_lock);
                mark_page_accessed(eb->first_page);
                return eb;
        }
-        spin_unlock(&tree->buffer_lock);
+        rcu_read_unlock();
        eb = __alloc_extent_buffer(tree, start, len, mask);
        if (!eb)
@@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+        if (ret)
+                goto free_eb;
        spin_lock(&tree->buffer_lock);
-        exists = buffer_tree_insert(tree, start, &eb->rb_node);
+        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
-        if (exists) {
+        if (ret == -EEXIST) {
+                exists = radix_tree_lookup(&tree->buffer,
+                                                start >> PAGE_CACHE_SHIFT);
                /* add one reference for the caller */
                atomic_inc(&exists->refs);
                spin_unlock(&tree->buffer_lock);
+                radix_tree_preload_end();
                goto free_eb;
        }
        /* add one reference for the tree */
        atomic_inc(&eb->refs);
        spin_unlock(&tree->buffer_lock);
+        radix_tree_preload_end();
        return eb;
 free_eb:
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
-        for (index = 1; index < i; index++)
+        btrfs_release_extent_buffer(eb);
-                page_cache_release(extent_buffer_page(eb, index));
-        page_cache_release(extent_buffer_page(eb, 0));
-        __free_extent_buffer(eb);
        return exists;
 }
@@ -3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 {
        struct extent_buffer *eb;
-        spin_lock(&tree->buffer_lock);
+        rcu_read_lock();
-        eb = buffer_search(tree, start);
+        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        if (eb)
+        if (eb && atomic_inc_not_zero(&eb->refs)) {
-                atomic_inc(&eb->refs);
+                rcu_read_unlock();
-        spin_unlock(&tree->buffer_lock);
-        if (eb)
                mark_page_accessed(eb->first_page);
+                return eb;
+        }
+        rcu_read_unlock();
-        return eb;
+        return NULL;
 }
 void free_extent_buffer(struct extent_buffer *eb)
@@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
+/*
+ * rcu_head callback: map @head back to the extent buffer that embeds it as
+ * its rcu_head member and release that buffer.
+ */
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+        btrfs_release_extent_buffer(container_of(head, struct extent_buffer,
+                                                 rcu_head));
+}
 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 {
+        /*
+         * Try to drop the extent buffer indexed at @page's offset from
+         * @tree->buffer.  Returns 1 when no buffer is indexed there or when
+         * the buffer was removed and queued for release; returns 0 when the
+         * buffer is dirty or its reference count is not exactly 1.
+         */
         u64 start = page_offset(page);
         struct extent_buffer *eb;
         int ret = 1;
-        unsigned long i;
-        unsigned long num_pages;
         spin_lock(&tree->buffer_lock);
-        eb = buffer_search(tree, start);
+        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
         if (!eb)
                 goto out;
-        if (atomic_read(&eb->refs) > 1) {
+        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                 ret = 0;
                 goto out;
         }
-        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+        /*
+         * set @eb->refs to 0 if it is already 1, and then release the @eb.
+         * Or go back.
+         */
+        if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
                 ret = 0;
                 goto out;
         }
-        /* at this point we can safely release the extent buffer */
-        num_pages = num_extent_pages(eb->start, eb->len);
+        radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        for (i = 0; i < num_pages; i++)
-                page_cache_release(extent_buffer_page(eb, i));
-        rb_erase(&eb->rb_node, &tree->buffer);
-        __free_extent_buffer(eb);
+        /*
+         * at this point we can safely release the extent buffer.  Queue it
+         * here, on the only path that took refs from 1 to 0, instead of
+         * re-reading eb->refs after "out:": there @eb is NULL when the
+         * lookup found nothing, and a buffer we did not claim must not be
+         * inspected once the lock has been dropped.
+         */
+        call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 out:
         spin_unlock(&tree->buffer_lock);
         return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590da..1c6d4f342ef7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
 struct extent_io_tree {
        struct rb_root state;
-        struct rb_root buffer;
+        struct radix_tree_root buffer;
        struct address_space *mapping;
        u64 dirty_bytes;
        spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
        unsigned long bflags;
        atomic_t refs;
        struct list_head leak_list;
-        struct rb_node rb_node;
+        struct rcu_head rcu_head;
        /* the spinlock is used to protect most operations */
        spinlock_t lock;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d6451..23cb8da3ff66 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
                goto out;
        }
        if (IS_ERR(rb_node)) {
-                em = ERR_PTR(PTR_ERR(rb_node));
+                em = ERR_CAST(rb_node);
                goto out;
        }
        em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                goto out;
        }
        if (IS_ERR(rb_node)) {
-                em = ERR_PTR(PTR_ERR(rb_node));
+                em = ERR_CAST(rb_node);
                goto out;
        }
        em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d99..22ee0dc2e6b8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,761 @@
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
+#include "disk-io.h"
 #define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+static void recalculate_thresholds(struct btrfs_block_group_cache
+                                   *block_group);
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_free_space *info);
+/*
+ * Return the inode that backs @block_group's free space cache.
+ *
+ * If block_group->inode is set and igrab() on it succeeds, that inode is
+ * returned.  Otherwise the (BTRFS_FREE_SPACE_OBJECTID, 0, block group
+ * objectid) item is searched in @root using @path, the inode key stored in
+ * its header is read and the inode is fetched with btrfs_iget().  Unless
+ * fs_info->closing is set, the result of an additional igrab() is stored in
+ * block_group->inode and block_group->iref is set to 1.
+ *
+ * Returns the inode, or an ERR_PTR(): the negative btrfs_search_slot()
+ * result, -ENOENT when the item is not found or the inode is NULL or bad,
+ * or the error pointer returned by btrfs_iget().
+ */
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+                                      struct btrfs_block_group_cache
+                                      *block_group, struct btrfs_path *path)
+{
+        struct btrfs_free_space_header *hdr;
+        struct btrfs_disk_key dkey;
+        struct btrfs_key inode_key;
+        struct btrfs_key search_key;
+        struct extent_buffer *eb;
+        struct inode *inode = NULL;
+        int err;
+        /* Fast path: the block group already points at its cache inode. */
+        spin_lock(&block_group->lock);
+        if (block_group->inode)
+                inode = igrab(block_group->inode);
+        spin_unlock(&block_group->lock);
+        if (inode)
+                return inode;
+        search_key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+        search_key.offset = block_group->key.objectid;
+        search_key.type = 0;
+        err = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+        if (err < 0)
+                return ERR_PTR(err);
+        if (err > 0) {
+                btrfs_release_path(root, path);
+                return ERR_PTR(-ENOENT);
+        }
+        /* Copy the inode key out of the header before the path is released. */
+        eb = path->nodes[0];
+        hdr = btrfs_item_ptr(eb, path->slots[0],
+                             struct btrfs_free_space_header);
+        btrfs_free_space_key(eb, hdr, &dkey);
+        btrfs_disk_key_to_cpu(&inode_key, &dkey);
+        btrfs_release_path(root, path);
+        inode = btrfs_iget(root->fs_info->sb, &inode_key, root, NULL);
+        if (!inode)
+                return ERR_PTR(-ENOENT);
+        if (IS_ERR(inode))
+                return inode;
+        if (is_bad_inode(inode)) {
+                iput(inode);
+                return ERR_PTR(-ENOENT);
+        }
+        /* Remember the inode on the block group unless we are closing. */
+        spin_lock(&block_group->lock);
+        if (!root->fs_info->closing) {
+                block_group->inode = igrab(inode);
+                block_group->iref = 1;
+        }
+        spin_unlock(&block_group->lock);
+        return inode;
+}
+/*
+ * Insert the two items a block group's free space cache needs in @root:
+ * an inode item under a newly found objectid, and a free space header item
+ * keyed (BTRFS_FREE_SPACE_OBJECTID, 0, block group objectid) that records
+ * the inode item's key.
+ *
+ * Returns 0 on success, otherwise the return value of the helper that
+ * failed.
+ */
+int create_free_space_inode(struct btrfs_root *root,
+                            struct btrfs_trans_handle *trans,
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_path *path)
+{
+        struct btrfs_free_space_header *hdr;
+        struct btrfs_inode_item *ii;
+        struct btrfs_disk_key inode_dkey;
+        struct btrfs_key hdr_key;
+        struct extent_buffer *eb;
+        u64 ino;
+        int err;
+        err = btrfs_find_free_objectid(trans, root, 0, &ino);
+        if (err < 0)
+                return err;
+        err = btrfs_insert_empty_inode(trans, root, path, ino);
+        if (err)
+                return err;
+        /* Step 1: zero the new inode item and fill in its fields. */
+        eb = path->nodes[0];
+        ii = btrfs_item_ptr(eb, path->slots[0], struct btrfs_inode_item);
+        /* Keep the inode item's key; the header item stores it below. */
+        btrfs_item_key(eb, &inode_dkey, path->slots[0]);
+        memset_extent_buffer(eb, 0, (unsigned long)ii, sizeof(*ii));
+        btrfs_set_inode_generation(eb, ii, trans->transid);
+        btrfs_set_inode_size(eb, ii, 0);
+        btrfs_set_inode_nbytes(eb, ii, 0);
+        btrfs_set_inode_uid(eb, ii, 0);
+        btrfs_set_inode_gid(eb, ii, 0);
+        btrfs_set_inode_mode(eb, ii, S_IFREG | 0600);
+        btrfs_set_inode_flags(eb, ii, BTRFS_INODE_NOCOMPRESS |
+                              BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+        btrfs_set_inode_nlink(eb, ii, 1);
+        btrfs_set_inode_transid(eb, ii, trans->transid);
+        btrfs_set_inode_block_group(eb, ii, block_group->key.objectid);
+        btrfs_mark_buffer_dirty(eb);
+        btrfs_release_path(root, path);
+        /* Step 2: insert the free space header pointing at that inode. */
+        hdr_key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+        hdr_key.offset = block_group->key.objectid;
+        hdr_key.type = 0;
+        err = btrfs_insert_empty_item(trans, root, path, &hdr_key,
+                                      sizeof(struct btrfs_free_space_header));
+        if (err < 0) {
+                btrfs_release_path(root, path);
+                return err;
+        }
+        eb = path->nodes[0];
+        hdr = btrfs_item_ptr(eb, path->slots[0],
+                             struct btrfs_free_space_header);
+        memset_extent_buffer(eb, 0, (unsigned long)hdr, sizeof(*hdr));
+        btrfs_set_free_space_key(eb, hdr, &inode_dkey);
+        btrfs_mark_buffer_dirty(eb);
+        btrfs_release_path(root, path);
+        return 0;
+}
+/*
+ * Truncate the free space cache @inode to size 0 within @trans.
+ *
+ * Points trans->block_rsv at root->orphan_block_rsv, checks that
+ * reservation, drops the page cache, removes the inode's items down to
+ * offset 0 and finally updates the inode item.  @path is not used here.
+ *
+ * Returns 0 on success or the error of the step that failed; a failure of
+ * btrfs_truncate_inode_items() additionally triggers a WARN_ON().
+ */
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+                                    struct btrfs_trans_handle *trans,
+                                    struct btrfs_path *path,
+                                    struct inode *inode)
+{
+        loff_t prev_size;
+        int err;
+        trans->block_rsv = root->orphan_block_rsv;
+        err = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, 0, 5);
+        if (err)
+                return err;
+        prev_size = i_size_read(inode);
+        btrfs_i_size_write(inode, 0);
+        truncate_pagecache(inode, prev_size, 0);
+        /*
+         * No orphan item is needed: truncating the free space cache is
+         * never split across transactions.
+         */
+        err = btrfs_truncate_inode_items(trans, root, inode,
+                                         0, BTRFS_EXTENT_DATA_KEY);
+        if (!err)
+                return btrfs_update_inode(trans, root, inode);
+        WARN_ON(1);
+        return err;
+}
+/*
+ * Start synchronous readahead over @inode's mapping, from page 0 with
+ * (i_size - 1) >> PAGE_CACHE_SHIFT as the request size, using a temporary
+ * file_ra_state.
+ *
+ * Returns 0, or -ENOMEM if the readahead state cannot be allocated.
+ */
+static int readahead_cache(struct inode *inode)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct file_ra_state *ra_state;
+        unsigned long last;
+        ra_state = kzalloc(sizeof(*ra_state), GFP_NOFS);
+        if (!ra_state)
+                return -ENOMEM;
+        file_ra_state_init(ra_state, mapping);
+        last = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+        page_cache_sync_readahead(mapping, ra_state, NULL, 0, last);
+        kfree(ra_state);
+        return 0;
+}
+/*
+ * Fill @block_group's in-memory free space entries from its on-disk free
+ * space cache.
+ *
+ * Layout read from the cache inode: page 0 starts with one u32 crc per
+ * page followed by a u64 generation; free space entries start right after
+ * that.  Pages that hold no entries carry the bitmaps, in the order their
+ * bitmap entries were seen.
+ *
+ * Returns 1 only when every entry and bitmap announced by the free space
+ * header was read and linked.  Returns 0 in every other case (filesystem
+ * closing, cache not in BTRFS_DC_WRITTEN state, inode or header item not
+ * available, empty cache, generation mismatch, allocation failure, read
+ * error, crc mismatch, malformed entry).  Failures detected while parsing
+ * pages also mark the block group BTRFS_DC_CLEAR and drop whatever was
+ * linked so far.
+ */
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+                          struct btrfs_block_group_cache *block_group)
+{
+        struct btrfs_root *root = fs_info->tree_root;
+        struct inode *inode;
+        struct btrfs_free_space_header *header;
+        struct extent_buffer *leaf;
+        struct page *page;
+        struct btrfs_path *path;
+        u32 *checksums = NULL, *crc;
+        char *disk_crcs = NULL;
+        struct btrfs_key key;
+        struct list_head bitmaps;
+        u64 num_entries;
+        u64 num_bitmaps;
+        u64 generation;
+        u32 cur_crc = ~(u32)0;
+        pgoff_t index = 0;
+        unsigned long first_page_offset;
+        int num_checksums;
+        int ret = 0;
+        /*
+         * If we're unmounting then just return, since this does a search on the
+         * normal root and not the commit root and we could deadlock.
+         */
+        smp_mb();
+        if (fs_info->closing)
+                return 0;
+        /*
+         * If this block group has been marked to be cleared for one reason or
+         * another then we can't trust the on disk cache, so just return.
+         */
+        spin_lock(&block_group->lock);
+        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
+        spin_unlock(&block_group->lock);
+        INIT_LIST_HEAD(&bitmaps);
+        path = btrfs_alloc_path();
+        if (!path)
+                return 0;
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (IS_ERR(inode)) {
+                btrfs_free_path(path);
+                return 0;
+        }
+        /* Nothing in the space cache, goodbye */
+        if (!i_size_read(inode)) {
+                btrfs_free_path(path);
+                goto out;
+        }
+        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+        key.offset = block_group->key.objectid;
+        key.type = 0;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret) {
+                /*
+                 * Not found (1) or error (< 0): either way nothing was
+                 * loaded.  Do not let the search result escape, 1 is this
+                 * function's "cache loaded" return value.
+                 */
+                ret = 0;
+                btrfs_free_path(path);
+                goto out;
+        }
+        leaf = path->nodes[0];
+        header = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_free_space_header);
+        num_entries = btrfs_free_space_entries(leaf, header);
+        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+        generation = btrfs_free_space_generation(leaf, header);
+        btrfs_free_path(path);
+        if (BTRFS_I(inode)->generation != generation) {
+                printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
+                       " not match free space cache generation (%llu) for "
+                       "block group %llu\n",
+                       (unsigned long long)BTRFS_I(inode)->generation,
+                       (unsigned long long)generation,
+                       (unsigned long long)block_group->key.objectid);
+                goto out;
+        }
+        if (!num_entries)
+                goto out;
+        /* Setup everything for doing checksumming */
+        num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+        checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+        if (!checksums)
+                goto out;
+        /* Entries on page 0 start after the crc array and the generation. */
+        first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+        disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
+        if (!disk_crcs)
+                goto out;
+        ret = readahead_cache(inode);
+        if (ret) {
+                ret = 0;
+                goto out;
+        }
+        while (1) {
+                struct btrfs_free_space_entry *entry;
+                struct btrfs_free_space *e;
+                void *addr;
+                unsigned long offset = 0;
+                unsigned long start_offset = 0;
+                int need_loop = 0;
+                if (!num_entries && !num_bitmaps)
+                        break;
+                if (index == 0) {
+                        start_offset = first_page_offset;
+                        offset = start_offset;
+                }
+                page = grab_cache_page(inode->i_mapping, index);
+                if (!page) {
+                        ret = 0;
+                        goto free_cache;
+                }
+                if (!PageUptodate(page)) {
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                printk(KERN_ERR "btrfs: error reading free "
+                                       "space cache: %llu\n",
+                                       (unsigned long long)
+                                       block_group->key.objectid);
+                                goto free_cache;
+                        }
+                }
+                addr = kmap(page);
+                if (index == 0) {
+                        u64 *gen;
+                        /* Keep a private copy of the on-disk crcs. */
+                        memcpy(disk_crcs, addr, first_page_offset);
+                        gen = addr + (sizeof(u32) * num_checksums);
+                        if (*gen != BTRFS_I(inode)->generation) {
+                                printk(KERN_ERR "btrfs: space cache generation"
+                                       " (%llu) does not match inode (%llu) "
+                                       "for block group %llu\n",
+                                       (unsigned long long)*gen,
+                                       (unsigned long long)
+                                       BTRFS_I(inode)->generation,
+                                       (unsigned long long)
+                                       block_group->key.objectid);
+                                kunmap(page);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                goto free_cache;
+                        }
+                        crc = (u32 *)disk_crcs;
+                }
+                entry = addr + start_offset;
+                /* First lets check our crc before we do anything fun */
+                cur_crc = ~(u32)0;
+                cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
+                                          PAGE_CACHE_SIZE - start_offset);
+                btrfs_csum_final(cur_crc, (char *)&cur_crc);
+                if (cur_crc != *crc) {
+                        printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
+                               "block group %llu\n", index,
+                               (unsigned long long)block_group->key.objectid);
+                        kunmap(page);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        goto free_cache;
+                }
+                crc++;
+                while (1) {
+                        if (!num_entries)
+                                break;
+                        need_loop = 1;
+                        e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+                        if (!e) {
+                                kunmap(page);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                goto free_cache;
+                        }
+                        e->offset = le64_to_cpu(entry->offset);
+                        e->bytes = le64_to_cpu(entry->bytes);
+                        if (!e->bytes) {
+                                kunmap(page);
+                                kfree(e);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                goto free_cache;
+                        }
+                        if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
+                                spin_lock(&block_group->tree_lock);
+                                ret = link_free_space(block_group, e);
+                                spin_unlock(&block_group->tree_lock);
+                                BUG_ON(ret);
+                        } else {
+                                e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+                                if (!e->bitmap) {
+                                        kunmap(page);
+                                        kfree(e);
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        goto free_cache;
+                                }
+                                spin_lock(&block_group->tree_lock);
+                                ret = link_free_space(block_group, e);
+                                block_group->total_bitmaps++;
+                                recalculate_thresholds(block_group);
+                                spin_unlock(&block_group->tree_lock);
+                                /* Its bitmap data is filled in from a later page. */
+                                list_add_tail(&e->list, &bitmaps);
+                        }
+                        num_entries--;
+                        offset += sizeof(struct btrfs_free_space_entry);
+                        if (offset + sizeof(struct btrfs_free_space_entry) >=
+                            PAGE_CACHE_SIZE)
+                                break;
+                        entry++;
+                }
+                /*
+                 * We read an entry out of this page, we need to move on to the
+                 * next page.
+                 */
+                if (need_loop) {
+                        kunmap(page);
+                        goto next;
+                }
+                /*
+                 * The header announced more bitmaps than bitmap entries were
+                 * read: there is no entry left to receive this page.
+                 */
+                if (list_empty(&bitmaps)) {
+                        kunmap(page);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        goto free_cache;
+                }
+                /*
+                 * We add the bitmaps at the end of the entries in order that
+                 * the bitmap entries are added to the cache.
+                 */
+                e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+                list_del_init(&e->list);
+                memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
+                kunmap(page);
+                num_bitmaps--;
+next:
+                unlock_page(page);
+                page_cache_release(page);
+                index++;
+        }
+        ret = 1;
+out:
+        kfree(checksums);
+        kfree(disk_crcs);
+        iput(inode);
+        return ret;
+free_cache:
+        /* This cache is bogus, make sure it gets cleared */
+        spin_lock(&block_group->lock);
+        block_group->disk_cache_state = BTRFS_DC_CLEAR;
+        spin_unlock(&block_group->lock);
+        btrfs_remove_free_space_cache(block_group);
+        /* A bogus cache is never a successful load, whatever ret last held. */
+        ret = 0;
+        goto out;
+}
+int btrfs_write_out_cache(struct btrfs_root *root,
+                          struct btrfs_trans_handle *trans,
+                          struct btrfs_block_group_cache *block_group,
+                          struct btrfs_path *path)
+{
+        struct btrfs_free_space_header *header;
+        struct extent_buffer *leaf;
+        struct inode *inode;
+        struct rb_node *node;
+        struct list_head *pos, *n;
+        struct page *page;
+        struct extent_state *cached_state = NULL;
+        struct list_head bitmap_list;
+        struct btrfs_key key;
+        u64 bytes = 0;
+        u32 *crc, *checksums;
+        pgoff_t index = 0, last_index = 0;
+        unsigned long first_page_offset;
+        int num_checksums;
+        int entries = 0;
+        int bitmaps = 0;
+        int ret = 0;
+        root = root->fs_info->tree_root;
+        INIT_LIST_HEAD(&bitmap_list);
+        spin_lock(&block_group->lock);
+        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+                spin_unlock(&block_group->lock);
+                return 0;
+        }
+        spin_unlock(&block_group->lock);
+        inode = lookup_free_space_inode(root, block_group, path);
+        if (IS_ERR(inode))
+                return 0;
+        if (!i_size_read(inode)) {
+                iput(inode);
+                return 0;
+        }
+        last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+        filemap_write_and_wait(inode->i_mapping);
+        btrfs_wait_ordered_range(inode, inode->i_size &
+                                 ~(root->sectorsize - 1), (u64)-1);
+        /* We need a checksum per page. */
+        num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+        crc = checksums  = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+        if (!crc) {
+                iput(inode);
+                return 0;
+        }
+        /* Since the first page has all of our checksums and our generation we
+         * need to calculate the offset into the page that we can start writing
+         * our entries.
+         */
+        first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+        node = rb_first(&block_group->free_space_offset);
+        if (!node)
+                goto out_free;
+        /*
+         * Lock all pages first so we can lock the extent safely.
+         *
+         * NOTE: Because we hold the ref the entire time we're going to write to
+         * the page find_get_page should never fail, so we don't do a check
+         * after find_get_page at this point.  Just putting this here so people
+         * know and don't freak out.
+         */
+        while (index <= last_index) {
+                page = grab_cache_page(inode->i_mapping, index);
+                if (!page) {
+                        pgoff_t i = 0;
+                        while (i < index) {
+                                page = find_get_page(inode->i_mapping, i);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                page_cache_release(page);
+                                i++;
+                        }
+                        goto out_free;
+                }
+                index++;
+        }
+        index = 0;
+        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+                         0, &cached_state, GFP_NOFS);
+        /* Write out the extent entries */
+        do {
+                struct btrfs_free_space_entry *entry;
+                void *addr;
+                unsigned long offset = 0;
+                unsigned long start_offset = 0;
+                if (index == 0) {
+                        start_offset = first_page_offset;
+                        offset = start_offset;
+                }
+                page = find_get_page(inode->i_mapping, index);
+                addr = kmap(page);
+                entry = addr + start_offset;
+                memset(addr, 0, PAGE_CACHE_SIZE);
+                while (1) {
+                        struct btrfs_free_space *e;
+                        e = rb_entry(node, struct btrfs_free_space, offset_index);
+                        entries++;
+                        entry->offset = cpu_to_le64(e->offset);
+                        entry->bytes = cpu_to_le64(e->bytes);
+                        if (e->bitmap) {
+                                entry->type = BTRFS_FREE_SPACE_BITMAP;
+                                list_add_tail(&e->list, &bitmap_list);
+                                bitmaps++;
+                        } else {
+                                entry->type = BTRFS_FREE_SPACE_EXTENT;
+                        }
+                        node = rb_next(node);
+                        if (!node)
+                                break;
+                        offset += sizeof(struct btrfs_free_space_entry);
+                        if (offset + sizeof(struct btrfs_free_space_entry) >=
+                            PAGE_CACHE_SIZE)
+                                break;
+                        entry++;
+                }
+                *crc = ~(u32)0;
+                *crc = btrfs_csum_data(root, addr + start_offset, *crc,
+                                       PAGE_CACHE_SIZE - start_offset);
+                kunmap(page);
+                btrfs_csum_final(*crc, (char *)crc);
+                crc++;
+                bytes += PAGE_CACHE_SIZE;
+                ClearPageChecked(page);
+                set_page_extent_mapped(page);
+                SetPageUptodate(page);
+                set_page_dirty(page);
+                /*
+                 * We need to release our reference we got for grab_cache_page,
+                 * except for the first page which will hold our checksums, we
+                 * do that below.
+                 */
+                if (index != 0) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                }
+                page_cache_release(page);
+                index++;
+        } while (node);
+        /* Write out the bitmaps */
+        list_for_each_safe(pos, n, &bitmap_list) {
+                void *addr;
+                struct btrfs_free_space *entry =
+                        list_entry(pos, struct btrfs_free_space, list);
+                page = find_get_page(inode->i_mapping, index);
+                addr = kmap(page);
+                memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
+                *crc = ~(u32)0;
+                *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
+                kunmap(page);
+                btrfs_csum_final(*crc, (char *)crc);
+                crc++;
+                bytes += PAGE_CACHE_SIZE;
+                ClearPageChecked(page);
+                set_page_extent_mapped(page);
+                SetPageUptodate(page);
+                set_page_dirty(page);
+                unlock_page(page);
+                page_cache_release(page);
+                page_cache_release(page);
+                list_del_init(&entry->list);
+                index++;
+        }
+        /* Zero out the rest of the pages just to make sure */
+        while (index <= last_index) {
+                void *addr;
+                page = find_get_page(inode->i_mapping, index);
+                addr = kmap(page);
+                memset(addr, 0, PAGE_CACHE_SIZE);
+                kunmap(page);
+                ClearPageChecked(page);
+                set_page_extent_mapped(page);
+                SetPageUptodate(page);
+                set_page_dirty(page);
+                unlock_page(page);
+                page_cache_release(page);
+                page_cache_release(page);
+                bytes += PAGE_CACHE_SIZE;
+                index++;
+        }
+        btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
+        /* Write the checksums and trans id to the first page */
+        {
+                void *addr;
+                u64 *gen;
+                page = find_get_page(inode->i_mapping, 0);
+                addr = kmap(page);
+                memcpy(addr, checksums, sizeof(u32) * num_checksums);
+                gen = addr + (sizeof(u32) * num_checksums);
+                *gen = trans->transid;
+                kunmap(page);
+                ClearPageChecked(page);
+                set_page_extent_mapped(page);
+                SetPageUptodate(page);
+                set_page_dirty(page);
+                unlock_page(page);
+                page_cache_release(page);
+                page_cache_release(page);
+        }
+        BTRFS_I(inode)->generation = trans->transid;
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+                             i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+        filemap_write_and_wait(inode->i_mapping);
+        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+        key.offset = block_group->key.objectid;
+        key.type = 0;
+        ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+        if (ret < 0) {
+                ret = 0;
+                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+                                 EXTENT_DIRTY | EXTENT_DELALLOC |
+                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+                goto out_free;
+        }
+        leaf = path->nodes[0];
+        if (ret > 0) {
+                struct btrfs_key found_key;
+                BUG_ON(!path->slots[0]);
+                path->slots[0]--;
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+                    found_key.offset != block_group->key.objectid) {
+                        ret = 0;
+                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+                                         EXTENT_DIRTY | EXTENT_DELALLOC |
+                                         EXTENT_DO_ACCOUNTING, 0, 0, NULL,
+                                         GFP_NOFS);
+                        btrfs_release_path(root, path);
+                        goto out_free;
+                }
+        }
+        header = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_free_space_header);
+        btrfs_set_free_space_entries(leaf, header, entries);
+        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+        btrfs_set_free_space_generation(leaf, header, trans->transid);
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_release_path(root, path);
+        ret = 1;
+out_free:
+        if (ret == 0) {
+                invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+                spin_lock(&block_group->lock);
+                block_group->disk_cache_state = BTRFS_DC_ERROR;
+                spin_unlock(&block_group->lock);
+                BTRFS_I(inode)->generation = 0;
+        }
+        kfree(checksums);
+        btrfs_update_inode(trans, root, inode);
+        iput(inode);
+        return ret;
+}
 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
                                          u64 offset)
 {
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..e49ca5c321b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
        struct list_head list;
 };
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+                                      struct btrfs_block_group_cache
+                                      *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+                            struct btrfs_trans_handle *trans,
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_path *path);
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+                                    struct btrfs_trans_handle *trans,
+                                    struct btrfs_path *path,
+                                    struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+                          struct btrfs_block_group_cache *block_group);
+int btrfs_write_out_cache(struct btrfs_root *root,
+                          struct btrfs_trans_handle *trans,
+                          struct btrfs_block_group_cache *block_group,
+                          struct btrfs_path *path);
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..558cac2dfa54 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 num_bytes;
-        u64 orig_start;
-        u64 disk_num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
        int i;
        int will_compress;
-        orig_start = start;
        actual_end = min_t(u64, isize, end + 1);
 again:
        will_compress = 0;
@@ -371,7 +367,6 @@ again:
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
-        disk_num_bytes = num_bytes;
        total_in = 0;
        ret = 0;
@@ -467,7 +462,6 @@ again:
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
-                        disk_num_bytes = total_compressed;
                        num_bytes = total_in;
                }
        }
@@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode,
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
-        u64 actual_end;
-        u64 isize = i_size_read(inode);
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;
+        BUG_ON(root == root->fs_info->tree_root);
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-        actual_end = min_t(u64, isize, end + 1);
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;
@@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        int type;
        int nocow;
        int check_prev = 1;
+        bool nolock = false;
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        trans = btrfs_join_transaction(root, 1);
+        if (root == root->fs_info->tree_root) {
+                nolock = true;
+                trans = btrfs_join_transaction_nolock(root, 1);
+        } else {
+                trans = btrfs_join_transaction(root, 1);
+        }
        BUG_ON(!trans);
        cow_start = (u64)-1;
@@ -1211,8 +1208,13 @@ out_check:
                BUG_ON(ret);
        }
-        ret = btrfs_end_transaction(trans, root);
+        if (nolock) {
-        BUG_ON(ret);
+                ret = btrfs_end_transaction_nolock(trans, root);
+                BUG_ON(ret);
+        } else {
+                ret = btrfs_end_transaction(trans, root);
+                BUG_ON(ret);
+        }
        btrfs_free_path(path);
        return 0;
 }
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
+                int do_list = (root->root_key.objectid !=
+                               BTRFS_ROOT_TREE_OBJECTID);
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
                spin_lock(&root->fs_info->delalloc_lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                root->fs_info->delalloc_bytes += len;
-                if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+                if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
                                      &root->fs_info->delalloc_inodes);
                }
@@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
+                int do_list = (root->root_key.objectid !=
+                               BTRFS_ROOT_TREE_OBJECTID);
                if (*bits & EXTENT_FIRST_DELALLOC)
                        *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
                if (*bits & EXTENT_DO_ACCOUNTING)
                        btrfs_delalloc_release_metadata(inode, len);
-                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+                    && do_list)
                        btrfs_free_reserved_data_space(inode, len);
                spin_lock(&root->fs_info->delalloc_lock);
                root->fs_info->delalloc_bytes -= len;
                BTRFS_I(inode)->delalloc_bytes -= len;
-                if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_del_init(&BTRFS_I(inode)->delalloc_inodes);
                }
@@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
        if (map_length < length + size)
                return 1;
-        return 0;
+        return ret;
 }
 /*
@@ -1426,7 +1433,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        if (root == root->fs_info->tree_root)
+                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
+        else
+                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
        BUG_ON(ret);
        if (!(rw & REQ_WRITE)) {
@@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        struct extent_state *cached_state = NULL;
        int compressed = 0;
        int ret;
+        bool nolock = false;
        ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
                                             end - start + 1);
@@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                return 0;
        BUG_ON(!ordered_extent);
+        nolock = (root == root->fs_info->tree_root);
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list));
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
-                        trans = btrfs_join_transaction(root, 1);
+                        if (nolock)
+                                trans = btrfs_join_transaction_nolock(root, 1);
+                        else
+                                trans = btrfs_join_transaction(root, 1);
+                        BUG_ON(!trans);
                        btrfs_set_trans_block_group(trans, inode);
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                         ordered_extent->file_offset + ordered_extent->len - 1,
                         0, &cached_state, GFP_NOFS);
-        trans = btrfs_join_transaction(root, 1);
+        if (nolock)
+                trans = btrfs_join_transaction_nolock(root, 1);
+        else
+                trans = btrfs_join_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->len);
                BUG_ON(ret);
        } else {
+                BUG_ON(root == root->fs_info->tree_root);
                ret = insert_reserved_file_extent(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->start,
@@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
 out:
-        btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+        if (nolock) {
-        if (trans)
+                if (trans)
-                btrfs_end_transaction(trans, root);
+                        btrfs_end_transaction_nolock(trans, root);
+        } else {
+                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+                if (trans)
+                        btrfs_end_transaction(trans, root);
+        }
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
@@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 {
        struct btrfs_path *path;
        struct extent_buffer *leaf;
-        struct btrfs_item *item;
        struct btrfs_key key, found_key;
        struct btrfs_trans_handle *trans;
        struct inode *inode;
@@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                /* pull out the item */
                leaf = path->nodes[0];
-                item = btrfs_item_nr(leaf, path->slots[0]);
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                /* make sure the item matches what we want */
@@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
                                           dir, index);
-        BUG_ON(ret);
+        if (ret == -ENOENT)
+                ret = 0;
 err:
        btrfs_free_path(path);
        if (ret)
@@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root,
 {
        struct extent_buffer *eb;
        int level;
-        int ret;
        u64 refs = 1;
+        int uninitialized_var(ret);
        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
@@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root,
                if (refs > 1)
                        return 1;
        }
-        return 0;
+        return ret; /* XXX callers? */
 }
 /*
@@ -3196,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
-        if (root->ref_cows)
+        if (root->ref_cows || root == root->fs_info->tree_root)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
        path = btrfs_alloc_path();
@@ -3344,7 +3370,8 @@ delete:
                } else {
                        break;
                }
-                if (found_extent && root->ref_cows) {
+                if (found_extent && (root->ref_cows ||
+                                     root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
@@ -3675,7 +3702,8 @@ void btrfs_evict_inode(struct inode *inode)
        int ret;
        truncate_inode_pages(&inode->i_data, 0);
-        if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+        if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+                               root == root->fs_info->tree_root))
                goto no_delete;
        if (is_bad_inode(inode)) {
@@ -3849,7 +3877,7 @@ again:
        p = &root->inode_tree.rb_node;
        parent = NULL;
-        if (hlist_unhashed(&inode->i_hash))
+        if (inode_unhashed(inode))
                return;
        spin_lock(&root->inode_lock);
@@ -3888,7 +3916,14 @@ static void inode_tree_del(struct inode *inode)
        }
        spin_unlock(&root->inode_lock);
-        if (empty && btrfs_root_refs(&root->root_item) == 0) {
+        /*
+         * Free space cache has inodes in the tree root, but the tree root has a
+         * root_refs of 0, so this could end up dropping the tree root as a
+         * snapshot, so we need the extra !root->fs_info->tree_root check to
+         * make sure we don't drop it.
+         */
+        if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+            root != root->fs_info->tree_root) {
                synchronize_srcu(&root->fs_info->subvol_srcu);
                spin_lock(&root->inode_lock);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4282,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
+        bool nolock = false;
        if (BTRFS_I(inode)->dummy_inode)
                return 0;
+        smp_mb();
+        nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
        if (wbc->sync_mode == WB_SYNC_ALL) {
-                trans = btrfs_join_transaction(root, 1);
+                if (nolock)
+                        trans = btrfs_join_transaction_nolock(root, 1);
+                else
+                        trans = btrfs_join_transaction(root, 1);
                btrfs_set_trans_block_group(trans, inode);
-                ret = btrfs_commit_transaction(trans, root);
+                if (nolock)
+                        ret = btrfs_end_transaction_nolock(trans, root);
+                else
+                        ret = btrfs_commit_transaction(trans, root);
        }
        return ret;
 }
@@ -4758,7 +4803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
        btrfs_set_trans_block_group(trans, dir);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
@@ -5645,7 +5690,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
        struct bio_vec *bvec = bio->bi_io_vec;
-        u64 start;
        int skip_sum;
        int write = rw & REQ_WRITE;
        int ret = 0;
@@ -5671,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        dip->inode = inode;
        dip->logical_offset = file_offset;
-        start = dip->logical_offset;
        dip->bytes = 0;
        do {
                dip->bytes += bvec->bv_len;
@@ -6308,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
+        if (root == root->fs_info->tree_root) {
+                struct btrfs_block_group_cache *block_group;
+                block_group = btrfs_lookup_block_group(root->fs_info,
+                                                BTRFS_I(inode)->block_group);
+                if (block_group && block_group->inode == inode) {
+                        spin_lock(&block_group->lock);
+                        block_group->inode = NULL;
+                        spin_unlock(&block_group->lock);
+                        btrfs_put_block_group(block_group);
+                } else if (block_group) {
+                        btrfs_put_block_group(block_group);
+                }
+        }
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6340,7 +6398,8 @@ int btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        if (btrfs_root_refs(&root->root_item) == 0)
+        if (btrfs_root_refs(&root->root_item) == 0 &&
+            root != root->fs_info->tree_root)
                return 1;
        else
                return generic_drop_inode(inode);
@@ -6609,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        return 0;
 }
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+                                   int sync)
 {
        struct btrfs_inode *binode;
        struct inode *inode = NULL;
@@ -6631,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
        spin_unlock(&root->fs_info->delalloc_lock);
        if (inode) {
-                write_inode_now(inode, 0);
+                if (sync) {
+                        filemap_write_and_wait(inode->i_mapping);
+                        /*
+                         * We have to do this because compression doesn't
+                         * actually set PG_writeback until it submits the pages
+                         * for IO, which happens in an async thread, so we could
+                         * race and not actually wait for any writeback pages
+                         * because they've not been submitted yet.  Technically
+                         * this could still be the case for the ordered stuff
+                         * since the async thread may not have started to do its
+                         * work yet.  If this becomes the case then we need to
+                         * figure out a way to make sure that in writepage we
+                         * wait for any async pages to be submitted before
+                         * returning so that fdatawait does what it's supposed to
+                         * do.
+                         */
+                        btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                } else {
+                        filemap_flush(inode->i_mapping);
+                }
                if (delay_iput)
                        btrfs_add_delayed_iput(inode);
                else
@@ -6757,27 +6836,33 @@ out_unlock:
        return err;
 }
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
-                              u64 start, u64 num_bytes, u64 min_size,
+                                       u64 start, u64 num_bytes, u64 min_size,
-                              loff_t actual_len, u64 *alloc_hint)
+                                       loff_t actual_len, u64 *alloc_hint,
+                                       struct btrfs_trans_handle *trans)
 {
-        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
        int ret = 0;
+        bool own_trans = true;
+        if (trans)
+                own_trans = false;
        while (num_bytes > 0) {
-                trans = btrfs_start_transaction(root, 3);
+                if (own_trans) {
-                if (IS_ERR(trans)) {
+                        trans = btrfs_start_transaction(root, 3);
-                        ret = PTR_ERR(trans);
+                        if (IS_ERR(trans)) {
-                        break;
+                                ret = PTR_ERR(trans);
+                                break;
+                        }
                }
                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
                                           0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                        btrfs_end_transaction(trans, root);
+                        if (own_trans)
+                                btrfs_end_transaction(trans, root);
                        break;
                }
@@ -6810,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
-                btrfs_end_transaction(trans, root);
+                if (own_trans)
+                        btrfs_end_transaction(trans, root);
        }
        return ret;
 }
+/*
+ * Preallocate a file range without a caller-supplied transaction.
+ *
+ * Forwards every argument to __btrfs_prealloc_file_range() and passes NULL
+ * as the transaction handle.  With a NULL handle the helper starts its own
+ * transaction for each pass of its allocation loop and ends it again itself.
+ *
+ * Returns whatever __btrfs_prealloc_file_range() returns.
+ */
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                              u64 start, u64 num_bytes, u64 min_size,
+                              loff_t actual_len, u64 *alloc_hint)
+{
+        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                           min_size, actual_len, alloc_hint,
+                                           NULL);
+}
+/*
+ * Preallocate a file range inside a transaction the caller already holds.
+ *
+ * Same as btrfs_prealloc_file_range(), except that @trans is handed through
+ * to __btrfs_prealloc_file_range().  When the handle is non-NULL the helper
+ * neither starts nor ends a transaction, so ending @trans is left to the
+ * caller.
+ *
+ * Returns whatever __btrfs_prealloc_file_range() returns.
+ */
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+                                    struct btrfs_trans_handle *trans, int mode,
+                                    u64 start, u64 num_bytes, u64 min_size,
+                                    loff_t actual_len, u64 *alloc_hint)
+{
+        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+                                           min_size, actual_len, alloc_hint, trans);
+}
 static long btrfs_fallocate(struct inode *inode, int mode,
                            loff_t offset, loff_t len)
 {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58dbe..463d91b4dd3a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 static noinline int create_subvol(struct btrfs_root *root,
                                  struct dentry *dentry,
-                                  char *name, int namelen)
+                                  char *name, int namelen,
+                                  u64 *async_transid)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
        d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
-        err = btrfs_commit_transaction(trans, root);
+        if (async_transid) {
+                *async_transid = trans->transid;
+                err = btrfs_commit_transaction_async(trans, root, 1);
+        } else {
+                err = btrfs_commit_transaction(trans, root);
+        }
        if (err && !ret)
                ret = err;
        return ret;
 }
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+                           char *name, int namelen, u64 *async_transid)
 {
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
-        ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
+        if (async_transid) {
+                *async_transid = trans->transid;
+                ret = btrfs_commit_transaction_async(trans,
+                                     root->fs_info->extent_root, 1);
+        } else {
+                ret = btrfs_commit_transaction(trans,
+                                               root->fs_info->extent_root);
+        }
        BUG_ON(ret);
        ret = pending_snapshot->error;
@@ -395,6 +409,76 @@ fail:
        return ret;
 }
+/*
+ * Local copy of check_sticky() from fs/namei.c.
+ *
+ * Returns 0 when the sticky bit does not stand in the way of the caller
+ * acting on @inode inside @dir: either S_ISVTX is not set on @dir, or the
+ * caller's fsuid owns @inode, or the caller's fsuid owns @dir.  In every
+ * other case the result is !capable(CAP_FOWNER), i.e. non-zero when the
+ * caller lacks CAP_FOWNER.
+ *
+ * It's inline, so the penalty for filesystems that don't use the sticky
+ * bit is minimal.
+ */
+static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
+{
+        uid_t fsuid = current_fsuid();
+        if (!(dir->i_mode & S_ISVTX))
+                return 0;
+        if (inode->i_uid == fsuid)
+                return 0;
+        if (dir->i_uid == fsuid)
+                return 0;
+        return !capable(CAP_FOWNER);
+}
+/*
+ * Local copy of may_delete() from fs/namei.c.
+ *
+ * Check whether we can remove a link @victim from directory @dir, and
+ * whether the type of @victim is right (@isdir non-zero means the caller
+ * wants to remove a directory).
+ *  1. We can't do it if dir is read-only (done in permission())
+ *  2. We should have write and exec permissions on dir
+ *  3. We can't remove anything from append-only dir
+ *  4. We can't do anything with immutable dir (done in permission())
+ *  5. If the sticky bit on dir is set we should either
+ *      a. be owner of dir, or
+ *      b. be owner of victim, or
+ *      c. have CAP_FOWNER capability
+ *  6. If the victim is append-only, immutable or a swapfile we can't do
+ *     anything with links pointing to it.
+ *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ *  9. We can't remove a root or mountpoint (only the IS_ROOT() case is
+ *     checked here).
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *     nfs_async_unlink().
+ *
+ * Returns 0 if the removal may proceed.  Otherwise: -ENOENT when @victim
+ * has no inode or @dir is already dead, the error from inode_permission()
+ * on @dir, -EPERM for rules 3, 5 and 6, -ENOTDIR / -EISDIR for rules 7
+ * and 8, and -EBUSY for rules 9 and 10.
+ */
+static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+        int error;
+        if (!victim->d_inode)
+                return -ENOENT;
+        BUG_ON(victim->d_parent->d_inode != dir);
+        audit_inode_child(victim, dir);
+        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+        if (error)
+                return error;
+        if (IS_APPEND(dir))
+                return -EPERM;
+        if (btrfs_check_sticky(dir, victim->d_inode)||
+                IS_APPEND(victim->d_inode)||
+            IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+                return -EPERM;
+        if (isdir) {
+                if (!S_ISDIR(victim->d_inode->i_mode))
+                        return -ENOTDIR;
+                if (IS_ROOT(victim))
+                        return -EBUSY;
+        } else if (S_ISDIR(victim->d_inode->i_mode))
+                return -EISDIR;
+        if (IS_DEADDIR(dir))
+                return -ENOENT;
+        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+                return -EBUSY;
+        return 0;
+}
 /* copy of may_create in fs/namei.c() */
 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 {
@@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 */
 static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
-                                   struct btrfs_root *snap_src)
+                                   struct btrfs_root *snap_src,
+                                   u64 *async_transid)
 {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
@@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
                goto out_up_read;
        if (snap_src) {
-                error = create_snapshot(snap_src, dentry);
+                error = create_snapshot(snap_src, dentry,
+                                        name, namelen, async_transid);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
-                                      name, namelen);
+                                      name, namelen, async_transid);
        }
        if (!error)
                fsnotify_mkdir(dir, dentry);
@@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        char *sizestr;
        char *devstr = NULL;
        int ret = 0;
-        int namelen;
        int mod = 0;
        if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                return PTR_ERR(vol_args);
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-        namelen = strlen(vol_args->name);
        mutex_lock(&root->fs_info->volume_mutex);
        sizestr = vol_args->name;
@@ -801,11 +885,13 @@ out_unlock:
        return ret;
 }
-static noinline int btrfs_ioctl_snap_create(struct file *file,
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
-                                            void __user *arg, int subvol)
+                                                    char *name,
+                                                    unsigned long fd,
+                                                    int subvol,
+                                                    u64 *transid)
 {
        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
-        struct btrfs_ioctl_vol_args *vol_args;
        struct file *src_file;
        int namelen;
        int ret = 0;
@@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
-        vol_args = memdup_user(arg, sizeof(*vol_args));
+        namelen = strlen(name);
-        if (IS_ERR(vol_args))
+        if (strchr(name, '/')) {
-                return PTR_ERR(vol_args);
-        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-        namelen = strlen(vol_args->name);
-        if (strchr(vol_args->name, '/')) {
                ret = -EINVAL;
                goto out;
        }
        if (subvol) {
-                ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                     NULL);
+                                     NULL, transid);
        } else {
                struct inode *src_inode;
-                src_file = fget(vol_args->fd);
+                src_file = fget(fd);
                if (!src_file) {
                        ret = -EINVAL;
                        goto out;
@@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
                        fput(src_file);
                        goto out;
                }
-                ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                     BTRFS_I(src_inode)->root);
+                                     BTRFS_I(src_inode)->root,
+                                     transid);
                fput(src_file);
        }
 out:
+        return ret;
+}
+/*
+ * Shared handler for the snapshot/subvolume creation ioctls.
+ *
+ * @file:   open file the ioctl was issued on (the parent directory).
+ * @arg:    user pointer to a struct btrfs_ioctl_async_vol_args when @async
+ *          is set, otherwise to a struct btrfs_ioctl_vol_args.
+ * @subvol: non-zero to create a fresh subvolume, zero to snapshot the
+ *          subvolume referenced by the fd in the argument block.
+ * @async:  non-zero to commit asynchronously and report the transaction id
+ *          back to userspace in the transid field of @arg.
+ *
+ * Returns 0 on success or a negative errno.  The kernel copy of the
+ * argument block is freed on every path once it has been allocated.
+ */
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+                                            void __user *arg, int subvol,
+                                            int async)
+{
+        struct btrfs_ioctl_vol_args *vol_args = NULL;
+        struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+        char *name;
+        u64 fd;
+        u64 transid = 0;
+        int ret;
+        if (async) {
+                async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+                if (IS_ERR(async_vol_args))
+                        return PTR_ERR(async_vol_args);
+                name = async_vol_args->name;
+                fd = async_vol_args->fd;
+                /* userspace is not trusted to have terminated the name */
+                async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+        } else {
+                vol_args = memdup_user(arg, sizeof(*vol_args));
+                if (IS_ERR(vol_args))
+                        return PTR_ERR(vol_args);
+                name = vol_args->name;
+                fd = vol_args->fd;
+                /* userspace is not trusted to have terminated the name */
+                vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+        }
+        /*
+         * create_subvol()/create_snapshot() pick the asynchronous commit
+         * whenever they are handed a non-NULL transid pointer, so only
+         * pass one for the async ioctl; the synchronous ioctls must keep
+         * doing a full btrfs_commit_transaction().
+         */
+        ret = btrfs_ioctl_snap_create_transid(file, name, fd, subvol,
+                                              async ? &transid : NULL);
+        if (!ret && async) {
+                /* fall through to the kfree()s instead of returning early */
+                if (copy_to_user(arg +
+                                offsetof(struct btrfs_ioctl_async_vol_args,
+                                transid), &transid, sizeof(transid)))
+                        ret = -EFAULT;
+        }
        kfree(vol_args);
+        kfree(async_vol_args);
        return ret;
 }
@@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        args = kmalloc(sizeof(*args), GFP_KERNEL);
+        args = memdup_user(argp, sizeof(*args));
-        if (!args)
+        if (IS_ERR(args))
-                return -ENOMEM;
+                return PTR_ERR(args);
-        if (copy_from_user(args, argp, sizeof(*args))) {
-                kfree(args);
-                return -EFAULT;
-        }
        inode = fdentry(file)->d_inode;
        ret = search_ioctl(inode, args);
        if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        args = kmalloc(sizeof(*args), GFP_KERNEL);
+        args = memdup_user(argp, sizeof(*args));
-        if (!args)
+        if (IS_ERR(args))
-                return -ENOMEM;
+                return PTR_ERR(args);
-        if (copy_from_user(args, argp, sizeof(*args))) {
-                kfree(args);
-                return -EFAULT;
-        }
        inode = fdentry(file)->d_inode;
        if (args->treeid == 0)
@@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        int ret;
        int err = 0;
-        if (!capable(CAP_SYS_ADMIN))
-                return -EPERM;
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);
@@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        }
        inode = dentry->d_inode;
+        dest = BTRFS_I(inode)->root;
+        if (!capable(CAP_SYS_ADMIN)){
+                /*
+                 * Regular user.  Only allow this with a special mount
+                 * option, when the user has write+exec access to the
+                 * subvol root, and when rmdir(2) would have been
+                 * allowed.
+                 *
+                 * Note that this is _not_ a check that the subvol is
+                 * empty or doesn't contain data that we wouldn't
+                 * otherwise be able to delete.
+                 *
+                 * Users who want to delete empty subvols should try
+                 * rmdir(2).
+                 */
+                err = -EPERM;
+                if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+                        goto out_dput;
+                /*
+                 * Do not allow deletion if the parent dir is the same
+                 * as the dir to be deleted.  That means the ioctl
+                 * must be called on the dentry referencing the root
+                 * of the subvol, not a random directory contained
+                 * within it.
+                 */
+                err = -EINVAL;
+                if (root == dest)
+                        goto out_dput;
+                err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+                if (err)
+                        goto out_dput;
+                /* check if subvolume may be deleted by a non-root user */
+                err = btrfs_may_delete(dir, dentry, 1);
+                if (err)
+                        goto out_dput;
+        }
        if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
                err = -EINVAL;
                goto out_dput;
        }
-        dest = BTRFS_I(inode)->root;
        mutex_lock(&inode->i_mutex);
        err = d_invalidate(dentry);
        if (err)
@@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                BUG_ON(ret);
        }
-        ret = btrfs_commit_transaction(trans, root);
+        ret = btrfs_end_transaction(trans, root);
        BUG_ON(ret);
        inode->i_flags |= S_DEAD;
 out_up_write:
@@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        path->reada = 2;
        if (inode < src) {
-                mutex_lock(&inode->i_mutex);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-                mutex_lock(&src->i_mutex);
+                mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
        } else {
-                mutex_lock(&src->i_mutex);
+                mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-                mutex_lock(&inode->i_mutex);
+                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
        }
        /* determine range to clone */
@@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        while (1) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-                ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
+                ordered = btrfs_lookup_first_ordered_extent(src, off+len);
-                if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+                if (!ordered &&
+                    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
+                                   EXTENT_DELALLOC, 0, NULL))
                        break;
                unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
-                btrfs_wait_ordered_range(src, off, off+len);
+                btrfs_wait_ordered_range(src, off, len);
        }
        /* clone data */
@@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        }
                        btrfs_release_path(root, path);
-                        if (key.offset + datal < off ||
+                        if (key.offset + datal <= off ||
                            key.offset >= off+len)
                                goto next;
@@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
        return 0;
 }
+/*
+ * Fill @space with a summary of the block groups linked on @groups_list:
+ * total_bytes accumulates each group's key.offset, used_bytes accumulates
+ * btrfs_block_group_used() of each group's item, and flags ends up as the
+ * flags of the last group visited.  An empty list leaves all three at 0.
+ */
+static void get_block_group_info(struct list_head *groups_list,
+                                 struct btrfs_ioctl_space_info *space)
+{
+        struct list_head *pos;
+        space->flags = 0;
+        space->used_bytes = 0;
+        space->total_bytes = 0;
+        list_for_each(pos, groups_list) {
+                struct btrfs_block_group_cache *bg =
+                        list_entry(pos, struct btrfs_block_group_cache, list);
+                space->total_bytes += bg->key.offset;
+                space->used_bytes += btrfs_block_group_used(&bg->item);
+                space->flags = bg->flags;
+        }
+}
 long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
        struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
        struct btrfs_ioctl_space_info *dest_orig;
        struct btrfs_ioctl_space_info *user_dest;
        struct btrfs_space_info *info;
+        u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+                       BTRFS_BLOCK_GROUP_SYSTEM,
+                       BTRFS_BLOCK_GROUP_METADATA,
+                       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+        int num_types = 4;
        int alloc_size;
        int ret = 0;
        int slot_count = 0;
+        int i, c;
        if (copy_from_user(&space_args,
                           (struct btrfs_ioctl_space_args __user *)arg,
                           sizeof(space_args)))
                return -EFAULT;
-        /* first we count slots */
+        for (i = 0; i < num_types; i++) {
-        rcu_read_lock();
+                struct btrfs_space_info *tmp;
-        list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
-                slot_count++;
+                info = NULL;
-        rcu_read_unlock();
+                rcu_read_lock();
+                list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+                                        list) {
+                        if (tmp->flags == types[i]) {
+                                info = tmp;
+                                break;
+                        }
+                }
+                rcu_read_unlock();
+                if (!info)
+                        continue;
+                down_read(&info->groups_sem);
+                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+                        if (!list_empty(&info->block_groups[c]))
+                                slot_count++;
+                }
+                up_read(&info->groups_sem);
+        }
        /* space_slots == 0 means they are asking for a count */
        if (space_args.space_slots == 0) {
                space_args.total_spaces = slot_count;
                goto out;
        }
+        slot_count = min_t(int, space_args.space_slots, slot_count);
        alloc_size = sizeof(*dest) * slot_count;
        /* we generally have at most 6 or so space infos, one for each raid
         * level.  So, a whole page should be more than enough for everyone
         */
@@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
        dest_orig = dest;
        /* now we have a buffer to copy into */
-        rcu_read_lock();
+        for (i = 0; i < num_types; i++) {
-        list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
+                struct btrfs_space_info *tmp;
-                /* make sure we don't copy more than we allocated
-                 * in our buffer
+                info = NULL;
-                 */
+                rcu_read_lock();
-                if (slot_count == 0)
+                list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
-                        break;
+                                        list) {
-                slot_count--;
+                        if (tmp->flags == types[i]) {
+                                info = tmp;
-                /* make sure userland has enough room in their buffer */
+                                break;
-                if (space_args.total_spaces >= space_args.space_slots)
+                        }
-                        break;
+                }
+                rcu_read_unlock();
-                space.flags = info->flags;
+                if (!info)
-                space.total_bytes = info->total_bytes;
+                        continue;
-                space.used_bytes = info->bytes_used;
+                down_read(&info->groups_sem);
-                memcpy(dest, &space, sizeof(space));
+                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
-                dest++;
+                        if (!list_empty(&info->block_groups[c])) {
-                space_args.total_spaces++;
+                                get_block_group_info(&info->block_groups[c],
+                                                     &space);
+                                memcpy(dest, &space, sizeof(space));
+                                dest++;
+                                space_args.total_spaces++;
+                        }
+                }
+                up_read(&info->groups_sem);
        }
-        rcu_read_unlock();
        user_dest = (struct btrfs_ioctl_space_info *)
                (arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file)
        return 0;
 }
+/*
+ * BTRFS_IOC_START_SYNC handler: start a transaction on the root backing
+ * @file, hand it to btrfs_commit_transaction_async() and, when @argp is
+ * non-NULL, copy the id of that transaction out to userspace.
+ *
+ * Returns 0 on success, the error from starting or queueing the commit,
+ * or -EFAULT if the transaction id cannot be copied to @argp.
+ */
+static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+{
+        struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+        struct btrfs_trans_handle *trans;
+        u64 transid;
+        int ret;
+        trans = btrfs_start_transaction(root, 0);
+        /* a failed start comes back as an ERR_PTR; never dereference it */
+        if (IS_ERR(trans))
+                return PTR_ERR(trans);
+        transid = trans->transid;
+        /*
+         * NOTE(review): on failure the handle is not ended here because it
+         * is not visible from this function whether the async commit
+         * helper already consumed it - confirm against transaction.c.
+         */
+        ret = btrfs_commit_transaction_async(trans, root, 0);
+        if (ret)
+                return ret;
+        if (argp)
+                if (copy_to_user(argp, &transid, sizeof(transid)))
+                        return -EFAULT;
+        return 0;
+}
+/*
+ * BTRFS_IOC_WAIT_SYNC handler: wait for a transaction to commit.
+ *
+ * When @argp is non-NULL the transaction id is read from userspace;
+ * otherwise id 0 is used, which stands for the current transaction.
+ * Returns -EFAULT if the id cannot be read, else whatever
+ * btrfs_wait_for_commit() returns.
+ */
+static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+{
+        struct btrfs_root *root;
+        u64 transid = 0;  /* current trans */
+        if (argp && copy_from_user(&transid, argp, sizeof(transid)))
+                return -EFAULT;
+        root = BTRFS_I(file->f_dentry->d_inode)->root;
+        return btrfs_wait_for_commit(root, transid);
+}
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int
        case FS_IOC_GETVERSION:
                return btrfs_ioctl_getversion(file, argp);
        case BTRFS_IOC_SNAP_CREATE:
-                return btrfs_ioctl_snap_create(file, argp, 0);
+                return btrfs_ioctl_snap_create(file, argp, 0, 0);
+        case BTRFS_IOC_SNAP_CREATE_ASYNC:
+                return btrfs_ioctl_snap_create(file, argp, 0, 1);
        case BTRFS_IOC_SUBVOL_CREATE:
-                return btrfs_ioctl_snap_create(file, argp, 1);
+                return btrfs_ioctl_snap_create(file, argp, 1, 0);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp);
        case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_SYNC:
                btrfs_sync_fs(file->f_dentry->d_sb, 1);
                return 0;
+        case BTRFS_IOC_START_SYNC:
+                return btrfs_ioctl_start_sync(file, argp);
+        case BTRFS_IOC_WAIT_SYNC:
+                return btrfs_ioctl_wait_sync(file, argp);
        }
        return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517f..17c99ebdf960 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 4087
 /* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
 struct btrfs_ioctl_vol_args {
        __s64 fd;
        char name[BTRFS_PATH_NAME_MAX + 1];
 };
+#define BTRFS_SNAPSHOT_NAME_MAX 4079 /* 8 + 8 + (4079 + 1) = 4096; presumably sized to keep the struct at 4k - confirm */
+struct btrfs_ioctl_async_vol_args {
+        __s64 fd;       /* in: read by btrfs_ioctl_snap_create(); handed to fget() when snapshotting */
+        __u64 transid;  /* out: written back via copy_to_user() after a successful async create */
+        char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; /* in: the kernel forces name[BTRFS_SNAPSHOT_NAME_MAX] to '\0' */
+};
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
@@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args {
 #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
 #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
                                    struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+                                   struct btrfs_ioctl_async_vol_args)
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5add..f4621f6deca1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
        u64 end;
        u64 orig_end;
-        u64 wait_end;
        struct btrfs_ordered_extent *ordered;
        int found;
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                if (orig_end > INT_LIMIT(loff_t))
                        orig_end = INT_LIMIT(loff_t);
        }
-        wait_end = orig_end;
 again:
        /* start IO across the range first to instantiate any delalloc
         * extents
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..045c9c2b2d7e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
 #include "locking.h"
 #include "btrfs_inode.h"
 #include "async-thread.h"
+#include "free-space-cache.h"
 /*
 * backref_node, mapping_node and tree_block start with this
@@ -178,8 +179,6 @@ struct reloc_control {
        u64 search_start;
        u64 extents_found;
-        int block_rsv_retries;
        unsigned int stage:8;
        unsigned int create_reloc_tree:1;
        unsigned int merge_reloc_tree:1;
@@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
        LIST_HEAD(reloc_roots);
        u64 num_bytes = 0;
        int ret;
-        int retries = 0;
        mutex_lock(&root->fs_info->trans_mutex);
        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2141,7 @@ again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
                ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-                                          num_bytes, &retries);
+                                          num_bytes);
                if (ret)
                        err = ret;
        }
@@ -2155,7 +2153,6 @@ again:
                        btrfs_end_transaction(trans, rc->extent_root);
                        btrfs_block_rsv_release(rc->extent_root,
                                                rc->block_rsv, num_bytes);
-                        retries = 0;
                        goto again;
                }
        }
@@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
        trans->block_rsv = rc->block_rsv;
-        ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+        ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
-                                  &rc->block_rsv_retries);
        if (ret) {
                if (ret == -EAGAIN)
                        rc->commit_transaction = 1;
                return ret;
        }
-        rc->block_rsv_retries = 0;
        return 0;
 }
@@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
                ret = get_ref_objectid_v0(rc, path, extent_key,
                                          &ref_owner, NULL);
+                if (ret < 0)
+                        return ret;
                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
                level = (int)ref_owner;
                /* FIXME: get real generation */
@@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc,
        return ret;
 }
+/*
+ * Truncate a free space cache inode that lives in the tree root.
+ *
+ * @fs_info: filesystem the inode belongs to.
+ * @inode:   inode to truncate, or NULL to look it up by @ino.  A non-NULL
+ *           @inode is consumed: it is iput() before this function returns.
+ * @ino:     objectid used for the btrfs_iget() lookup when @inode is NULL.
+ *
+ * Returns 0 on success; -ENOENT when the lookup fails or yields a bad
+ * inode; -ENOMEM when no path can be allocated; the error from
+ * btrfs_join_transaction(); or the result of
+ * btrfs_truncate_free_space_cache().
+ */
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+                                    struct inode *inode, u64 ino)
+{
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        struct btrfs_root *root = fs_info->tree_root;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr;
+        int ret = 0;
+        if (inode)
+                goto truncate;
+        key.objectid = ino;
+        key.type = BTRFS_INODE_ITEM_KEY;
+        key.offset = 0;
+        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+        if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+                /* only a real (if bad) inode holds a reference to drop */
+                if (inode && !IS_ERR(inode))
+                        iput(inode);
+                return -ENOENT;
+        }
+truncate:
+        path = btrfs_alloc_path();
+        if (!path) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        trans = btrfs_join_transaction(root, 0);
+        if (IS_ERR(trans)) {
+                btrfs_free_path(path);
+                /* report the failure instead of leaving ret at 0 */
+                ret = PTR_ERR(trans);
+                goto out;
+        }
+        ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+        btrfs_free_path(path);
+        /* sample blocks_used before the handle is released */
+        nr = trans->blocks_used;
+        btrfs_end_transaction(trans, root);
+        btrfs_btree_balance_dirty(root, nr);
+out:
+        iput(inode);
+        return ret;
+}
 /*
 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
 * this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
        int counted;
        int ret;
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
        ref_root = btrfs_extent_data_ref_root(leaf, ref);
        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
        ref_count = btrfs_extent_data_ref_count(leaf, ref);
+        /*
+         * This is an extent belonging to the free space cache, lets just delete
+         * it and redo the search.
+         */
+        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+                ret = delete_block_group_cache(rc->extent_root->fs_info,
+                                               NULL, ref_objectid);
+                if (ret != -ENOENT)
+                        return ret;
+                ret = 0;
+        }
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
        root = read_fs_root(rc->extent_root->fs_info, ref_root);
        if (IS_ERR(root)) {
                err = PTR_ERR(root);
@@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc)
         * is no reservation in transaction handle.
         */
        ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
-                                  rc->extent_root->nodesize * 256,
+                                  rc->extent_root->nodesize * 256);
-                                  &rc->block_rsv_retries);
        if (ret)
                return ret;
@@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc)
        rc->extents_found = 0;
        rc->nodes_relocated = 0;
        rc->merging_rsv_size = 0;
-        rc->block_rsv_retries = 0;
        rc->create_reloc_tree = 1;
        set_reloc_control(rc);
@@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 {
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        struct reloc_control *rc;
+        struct inode *inode;
+        struct btrfs_path *path;
        int ret;
        int rw = 0;
        int err = 0;
@@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                rw = 1;
        }
+        path = btrfs_alloc_path();
+        if (!path) {
+                err = -ENOMEM;
+                goto out;
+        }
+        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+                                        path);
+        btrfs_free_path(path);
+        if (!IS_ERR(inode))
+                ret = delete_block_group_cache(fs_info, inode, 0);
+        else
+                ret = PTR_ERR(inode);
+        if (ret && ret != -ENOENT) {
+                err = ret;
+                goto out;
+        }
        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
        if (IS_ERR(rc->data_inode)) {
                err = PTR_ERR(rc->data_inode);
@@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
                btrfs_add_ordered_sum(inode, ordered, sums);
        }
        btrfs_put_ordered_extent(ordered);
-        return 0;
+        return ret;
 }
 void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c8..6a1086e83ffc 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
 {
        struct btrfs_root *dead_root;
-        struct btrfs_item *item;
        struct btrfs_root_item *ri;
        struct btrfs_key key;
        struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
                        nritems = btrfs_header_nritems(leaf);
                        slot = path->slots[0];
                }
-                item = btrfs_item_nr(leaf, slot);
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
                        goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..8299a25ffc8f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
        ret = close_ctree(root);
        sb->s_fs_info = NULL;
+        (void)ret; /* FIXME: need to fix VFS to return error? */
 }
 enum {
@@ -68,7 +70,8 @@ enum {
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-        Opt_discard, Opt_err,
+        Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+        Opt_user_subvol_rm_allowed,
 };
 static match_table_t tokens = {
@@ -92,6 +95,9 @@ static match_table_t tokens = {
        {Opt_flushoncommit, "flushoncommit"},
        {Opt_ratio, "metadata_ratio=%d"},
        {Opt_discard, "discard"},
+        {Opt_space_cache, "space_cache"},
+        {Opt_clear_cache, "clear_cache"},
+        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_err, NULL},
 };
@@ -235,6 +241,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                 case Opt_discard:
                         btrfs_set_opt(info->mount_opt, DISCARD);
                         break;
+                case Opt_space_cache:
+                        printk(KERN_INFO "btrfs: enabling disk space caching\n");
+                        btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+                        /*
+                         * Stop here: falling through would also set
+                         * CLEAR_CACHE for a plain "space_cache" mount.
+                         */
+                        break;
+                case Opt_clear_cache:
+                        printk(KERN_INFO "btrfs: force clearing of disk cache\n");
+                        btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+                        break;
+                case Opt_user_subvol_rm_allowed:
+                        btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+                        break;
                 case Opt_err:
                         printk(KERN_INFO "btrfs: unrecognized mount option "
                                "'%s'\n", p);
@@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 find_root:
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
        if (IS_ERR(new_root))
-                return ERR_PTR(PTR_ERR(new_root));
+                return ERR_CAST(new_root);
        if (btrfs_root_refs(&new_root->root_item) == 0)
                return ERR_PTR(-ENOENT);
@@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb,
 {
        struct inode *inode;
        struct dentry *root_dentry;
-        struct btrfs_super_block *disk_super;
        struct btrfs_root *tree_root;
        struct btrfs_key key;
        int err;
@@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
                return PTR_ERR(tree_root);
        }
        sb->s_fs_info = tree_root;
-        disk_super = &tree_root->fs_info->super_copy;
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
@@ -560,8 +574,8 @@ static int btrfs_test_super(struct super_block *s, void *data)
 * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
 *        for multiple device setup.  Make sure to keep it in sync.
 */
-static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data, struct vfsmount *mnt)
+                const char *dev_name, void *data)
 {
        struct block_device *bdev = NULL;
        struct super_block *s;
@@ -571,7 +585,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
        char *subvol_name = NULL;
        u64 subvol_objectid = 0;
        int error = 0;
-        int found = 0;
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
@@ -580,7 +593,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
                                          &subvol_name, &subvol_objectid,
                                          &fs_devices);
        if (error)
-                return error;
+                return ERR_PTR(error);
        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
        if (error)
@@ -607,7 +620,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
                        goto error_close_devices;
                }
-                found = 1;
                btrfs_close_devices(fs_devices);
        } else {
                char b[BDEVNAME_SIZE];
@@ -629,7 +641,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
        if (IS_ERR(root)) {
                error = PTR_ERR(root);
                deactivate_locked_super(s);
-                goto error;
+                goto error_free_subvol_name;
        }
        /* if they gave us a subvolume name bind mount into that */
        if (strcmp(subvol_name, ".")) {
@@ -643,24 +655,21 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
                        deactivate_locked_super(s);
                        error = PTR_ERR(new_root);
                        dput(root);
-                        goto error_close_devices;
+                        goto error_free_subvol_name;
                }
                if (!new_root->d_inode) {
                        dput(root);
                        dput(new_root);
                        deactivate_locked_super(s);
                        error = -ENXIO;
-                        goto error_close_devices;
+                        goto error_free_subvol_name;
                }
                dput(root);
                root = new_root;
        }
-        mnt->mnt_sb = s;
-        mnt->mnt_root = root;
        kfree(subvol_name);
-        return 0;
+        return root;
 error_s:
        error = PTR_ERR(s);
@@ -668,8 +677,7 @@ error_close_devices:
        btrfs_close_devices(fs_devices);
 error_free_subvol_name:
        kfree(subvol_name);
-error:
+        return ERR_PTR(error);
-        return error;
 }
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -716,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
+        u64 total_used_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
        rcu_read_lock();
-        list_for_each_entry_rcu(found, head, list)
+        list_for_each_entry_rcu(found, head, list) {
+                if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+                                    BTRFS_BLOCK_GROUP_SYSTEM))
+                        total_used_data += found->disk_total;
+                else
+                        total_used_data += found->disk_used;
                total_used += found->disk_used;
+        }
        rcu_read_unlock();
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-        buf->f_bavail = buf->f_bfree;
+        buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
@@ -746,7 +761,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
-        .get_sb         = btrfs_get_sb,
+        .mount          = btrfs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -815,6 +830,7 @@ static const struct file_operations btrfs_ctl_fops = {
        .unlocked_ioctl  = btrfs_control_ioctl,
        .compat_ioctl = btrfs_control_ioctl,
        .owner   = THIS_MODULE,
+        .llseek = noop_llseek,
 };
 static struct miscdevice btrfs_misc = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..1fffbc017bdf 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
        TRANS_START,
        TRANS_JOIN,
        TRANS_USERSPACE,
+        TRANS_JOIN_NOLOCK,
 };
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
-        int retries = 0;
        int ret;
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
-        mutex_lock(&root->fs_info->trans_mutex);
+        if (type != TRANS_JOIN_NOLOCK)
+                mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
@@ -195,7 +196,8 @@ again:
        cur_trans = root->fs_info->running_transaction;
        cur_trans->use_count++;
-        mutex_unlock(&root->fs_info->trans_mutex);
+        if (type != TRANS_JOIN_NOLOCK)
+                mutex_unlock(&root->fs_info->trans_mutex);
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
@@ -212,8 +214,7 @@ again:
        }
        if (num_items > 0) {
-                ret = btrfs_trans_reserve_metadata(h, root, num_items,
+                ret = btrfs_trans_reserve_metadata(h, root, num_items);
-                                                   &retries);
                if (ret == -EAGAIN) {
                        btrfs_commit_transaction(h, root);
                        goto again;
@@ -224,9 +225,11 @@ again:
                }
        }
-        mutex_lock(&root->fs_info->trans_mutex);
+        if (type != TRANS_JOIN_NOLOCK)
+                mutex_lock(&root->fs_info->trans_mutex);
        record_root_in_trans(h, root);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        if (type != TRANS_JOIN_NOLOCK)
+                mutex_unlock(&root->fs_info->trans_mutex);
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
@@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
        return start_transaction(root, 0, TRANS_JOIN);
 }
+/*
+ * Join the running transaction through start_transaction() using the
+ * TRANS_JOIN_NOLOCK type, for which start_transaction() skips taking
+ * fs_info->trans_mutex.  num_blocks is accepted but not used; zero items
+ * are requested.
+ * NOTE(review): presumably only safe where the caller is already
+ * serialized against other trans_mutex users -- confirm against callers.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+                                                          int num_blocks)
+{
+        return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+}
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
 {
@@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
        return 0;
 }
+/*
+ * Wait for a transaction commit to finish.
+ *
+ * transid != 0: wait for that transaction.  Returns 0 immediately if
+ * transid is not newer than fs_info->last_trans_committed, and -EINVAL if
+ * no transaction with that id is found on fs_info->trans_list.
+ * transid == 0: wait for the newest transaction on trans_list that has
+ * in_commit set.  Returns 0 without waiting if that transaction already
+ * has commit_done set, or if no transaction is in commit at all.
+ *
+ * Returns 0 once the wait has completed.
+ */
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+        struct btrfs_transaction *cur_trans = NULL, *t;
+        int ret;
+        mutex_lock(&root->fs_info->trans_mutex);
+        ret = 0;
+        if (transid) {
+                if (transid <= root->fs_info->last_trans_committed)
+                        goto out_unlock;
+                /* find specified transaction */
+                list_for_each_entry(t, &root->fs_info->trans_list, list) {
+                        if (t->transid == transid) {
+                                cur_trans = t;
+                                break;
+                        }
+                        /*
+                         * give up once past the requested id (presumably
+                         * the list is ordered by ascending transid)
+                         */
+                        if (t->transid > transid)
+                                break;
+                }
+                ret = -EINVAL;
+                if (!cur_trans)
+                        goto out_unlock;  /* bad transid */
+        } else {
+                /* find newest transaction that is committing | committed */
+                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+                                            list) {
+                        if (t->in_commit) {
+                                if (t->commit_done)
+                                        goto out_unlock;
+                                cur_trans = t;
+                                break;
+                        }
+                }
+                if (!cur_trans)
+                        goto out_unlock;  /* nothing committing|committed */
+        }
+        /* take a reference before dropping trans_mutex for the wait */
+        cur_trans->use_count++;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        wait_for_commit(root, cur_trans);
+        mutex_lock(&root->fs_info->trans_mutex);
+        put_transaction(cur_trans);
+        ret = 0;
+out_unlock:
+        mutex_unlock(&root->fs_info->trans_mutex);
+        return ret;
+}
 #if 0
 /*
 * rate limit against the drop_snapshot code.  This helps to slow down new
@@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, int throttle)
+                          struct btrfs_root *root, int throttle, int lock)
 {
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
@@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans, root);
-        if (!root->fs_info->open_ioctl_trans &&
+        if (lock && !root->fs_info->open_ioctl_trans &&
            should_end_transaction(trans, root))
                trans->transaction->blocked = 1;
-        if (cur_trans->blocked && !cur_trans->in_commit) {
+        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
                if (throttle)
                        return btrfs_commit_transaction(trans, root);
                else
                        wake_up_process(info->transaction_kthread);
        }
-        mutex_lock(&info->trans_mutex);
+        if (lock)
+                mutex_lock(&info->trans_mutex);
        WARN_ON(cur_trans != info->running_transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;
+        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
-        mutex_unlock(&info->trans_mutex);
+        if (lock)
+                mutex_unlock(&info->trans_mutex);
        if (current->journal_info == trans)
                current->journal_info = NULL;
@@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
 {
-        return __btrfs_end_transaction(trans, root, 0);
+        return __btrfs_end_transaction(trans, root, 0, 1);
 }
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-        return __btrfs_end_transaction(trans, root, 1);
+        return __btrfs_end_transaction(trans, root, 1, 1);
+}
+/*
+ * End a transaction handle through __btrfs_end_transaction() with
+ * throttle = 0 and lock = 0.  With lock == 0 that helper neither marks the
+ * transaction blocked, nor commits or wakes the transaction kthread, nor
+ * takes info->trans_mutex around the num_writers update.
+ */
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root)
+{
+        return __btrfs_end_transaction(trans, root, 0, 0);
 }
 /*
@@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
-        int retries = 0;
        u64 to_reserve = 0;
        u64 index = 0;
        u64 objectid;
@@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        if (to_reserve > 0) {
                ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
-                                          to_reserve, &retries);
+                                          to_reserve);
                if (ret) {
                        pending->error = ret;
                        goto fail;
@@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root)
        super->root = root_item->bytenr;
        super->generation = root_item->generation;
        super->root_level = root_item->level;
+        if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+                super->cache_generation = root_item->generation;
 }
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
        return ret;
 }
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ *
+ * Returns once trans->in_commit is set.  Sleeps on
+ * fs_info->transaction_blocked_wait.  Must be called with
+ * fs_info->trans_mutex held: the mutex is dropped around schedule() and
+ * re-taken before in_commit is checked again.
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+                                            struct btrfs_transaction *trans)
+{
+        DEFINE_WAIT(wait);
+        if (trans->in_commit)
+                return;
+        while (1) {
+                /* queue ourselves before re-checking the condition */
+                prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                if (trans->in_commit) {
+                        finish_wait(&root->fs_info->transaction_blocked_wait,
+                                    &wait);
+                        break;
+                }
+                mutex_unlock(&root->fs_info->trans_mutex);
+                schedule();
+                mutex_lock(&root->fs_info->trans_mutex);
+                finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
+        }
+}
+/*
+ * wait for the current transaction commit to start and then become
+ * unblocked.  caller holds ref.
+ *
+ * Returns once trans->commit_done is set, or trans->in_commit is set while
+ * trans->blocked is clear.  Sleeps on fs_info->transaction_wait.  Must be
+ * called with fs_info->trans_mutex held: the mutex is dropped around
+ * schedule() and re-taken before the condition is checked again.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+                                         struct btrfs_transaction *trans)
+{
+        DEFINE_WAIT(wait);
+        if (trans->commit_done || (trans->in_commit && !trans->blocked))
+                return;
+        while (1) {
+                /* queue ourselves before re-checking the condition */
+                prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                if (trans->commit_done ||
+                    (trans->in_commit && !trans->blocked)) {
+                        finish_wait(&root->fs_info->transaction_wait,
+                                    &wait);
+                        break;
+                }
+                mutex_unlock(&root->fs_info->trans_mutex);
+                schedule();
+                mutex_lock(&root->fs_info->trans_mutex);
+                finish_wait(&root->fs_info->transaction_wait,
+                            &wait);
+        }
+}
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ *
+ * struct btrfs_async_commit carries the state handed from
+ * btrfs_commit_transaction_async() to the do_async_commit() work function,
+ * which frees it.
+ */
+struct btrfs_async_commit {
+        /* handle that do_async_commit() passes to btrfs_commit_transaction() */
+        struct btrfs_trans_handle *newtrans;
+        /* root passed to btrfs_commit_transaction() */
+        struct btrfs_root *root;
+        /* delayed work item whose handler is do_async_commit() */
+        struct delayed_work work;
+};
+/*
+ * Work function for an asynchronous commit: recover the enclosing
+ * btrfs_async_commit from the embedded delayed work, commit its
+ * transaction handle, then free the container.  The return value of
+ * btrfs_commit_transaction() is not examined.
+ */
+static void do_async_commit(struct work_struct *work)
+{
+        struct btrfs_async_commit *ac =
+                container_of(work, struct btrfs_async_commit, work.work);
+        btrfs_commit_transaction(ac->newtrans, ac->root);
+        kfree(ac);
+}
+/*
+ * Start committing the transaction behind @trans from a work item and
+ * return before the commit has finished.
+ *
+ * A second handle is joined to the same transaction and handed to
+ * do_async_commit(); @trans itself is ended here.  With @wait_for_unblock
+ * set, the call returns once the commit is done or has started and is no
+ * longer blocked; otherwise it returns as soon as the commit has started.
+ *
+ * Returns 0 on success, -ENOMEM if the commit context cannot be allocated,
+ * or the error from btrfs_join_transaction().  @trans is ended on every
+ * path, including the error paths, so callers must not use it afterwards.
+ */
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   int wait_for_unblock)
+{
+        struct btrfs_async_commit *ac;
+        struct btrfs_transaction *cur_trans;
+        int err;
+        ac = kmalloc(sizeof(*ac), GFP_NOFS);
+        if (!ac) {
+                err = -ENOMEM;
+                goto fail;
+        }
+        INIT_DELAYED_WORK(&ac->work, do_async_commit);
+        ac->root = root;
+        ac->newtrans = btrfs_join_transaction(root, 0);
+        /* never hand an ERR_PTR handle to the commit worker */
+        if (IS_ERR(ac->newtrans)) {
+                err = PTR_ERR(ac->newtrans);
+                kfree(ac);
+                goto fail;
+        }
+        /* take transaction reference */
+        mutex_lock(&root->fs_info->trans_mutex);
+        cur_trans = trans->transaction;
+        cur_trans->use_count++;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        btrfs_end_transaction(trans, root);
+        schedule_delayed_work(&ac->work, 0);
+        /* wait for transaction to start and unblock */
+        mutex_lock(&root->fs_info->trans_mutex);
+        if (wait_for_unblock)
+                wait_current_trans_commit_start_and_unblock(root, cur_trans);
+        else
+                wait_current_trans_commit_start(root, cur_trans);
+        put_transaction(cur_trans);
+        mutex_unlock(&root->fs_info->trans_mutex);
+        return 0;
+fail:
+        /* keep the "trans is always consumed" contract on failure */
+        btrfs_end_transaction(trans, root);
+        return err;
+}
+/*
+ * btrfs_transaction state sequence:
+ *    in_commit = 0, blocked = 0  (initial)
+ *    in_commit = 1, blocked = 1
+ *    blocked = 0
+ *    commit_done = 1
+ */
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
        unsigned long joined = 0;
-        unsigned long timeout = 1;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
        DEFINE_WAIT(wait);
@@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
+        wake_up(&root->fs_info->transaction_blocked_wait);
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
@@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        snap_pending = 1;
                WARN_ON(cur_trans != trans->transaction);
-                if (cur_trans->num_writers > 1)
-                        timeout = MAX_SCHEDULE_TIMEOUT;
-                else if (should_grow)
-                        timeout = 1;
                mutex_unlock(&root->fs_info->trans_mutex);
                if (flush_on_commit || snap_pending) {
@@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                                TASK_UNINTERRUPTIBLE);
                smp_mb();
-                if (cur_trans->num_writers > 1 || should_grow)
+                if (cur_trans->num_writers > 1)
-                        schedule_timeout(timeout);
+                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+                else if (should_grow)
+                        schedule_timeout(1);
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bfd..f104b57ad4ef 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+                                                          int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   int wait_for_unblock);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed7..992ab425599d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        int ret = 0;
        int wret;
        int level;
-        int orig_level;
        int is_extent = 0;
        int next_key_ret = 0;
        u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        level = btrfs_header_level(root->node);
-        orig_level = level;
        if (level == 0)
                goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9c..a29f19384a27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 {
        struct inode *dir;
        int ret;
-        struct btrfs_key location;
        struct btrfs_inode_ref *ref;
        struct btrfs_dir_item *di;
        struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
        unsigned long ref_ptr;
        unsigned long ref_end;
-        location.objectid = key->objectid;
-        location.type = BTRFS_INODE_ITEM_KEY;
-        location.offset = 0;
        /*
         * it is possible that we didn't log all the parent directories
         * for a given inode.  If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
        struct btrfs_path *path;
        struct btrfs_root *root = wc->replay_dest;
        struct btrfs_key key;
-        u32 item_size;
        int level;
        int i;
        int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
        nritems = btrfs_header_nritems(eb);
        for (i = 0; i < nritems; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
-                item_size = btrfs_item_size_nr(eb, i);
                /* inode keys are done during the first stage */
                if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                                   struct walk_control *wc)
 {
        u64 root_owner;
-        u64 root_gen;
        u64 bytenr;
        u64 ptr_gen;
        struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                parent = path->nodes[*level];
                root_owner = btrfs_header_owner(parent);
-                root_gen = btrfs_header_generation(parent);
                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
                                 struct walk_control *wc)
 {
        u64 root_owner;
-        u64 root_gen;
        int i;
        int slot;
        int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
                slot = path->slots[i];
                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
-                        struct extent_buffer *node;
-                        node = path->nodes[i];
                        path->slots[i]++;
                        *level = i;
                        WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
                                parent = path->nodes[*level + 1];
                        root_owner = btrfs_header_owner(parent);
-                        root_gen = btrfs_header_generation(parent);
                        wc->process_func(root, path->nodes[*level], wc,
                                 btrfs_header_generation(path->nodes[*level]));
                        if (wc->free) {
@@ -2273,7 +2260,7 @@ fail:
        }
        btrfs_end_log_trans(root);
-        return 0;
+        return err;
 }
 /* see comments for btrfs_del_dir_entries_in_log */
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        struct btrfs_key max_key;
        struct btrfs_root *log = root->log_root;
        struct extent_buffer *src = NULL;
-        u32 size;
        int err = 0;
        int ret;
        int nritems;
@@ -2793,7 +2779,6 @@ again:
                        break;
                src = path->nodes[0];
-                size = btrfs_item_size_nr(src, path->slots[0]);
                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
                        ins_nr++;
                        goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..cc04dc1445d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
-                device->barriers = 1;
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-                device->barriers = 1;
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        trans = btrfs_start_transaction(root, 0);
        lock_chunks(root);
-        device->barriers = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
@@ -1901,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
        u64 size_to_free;
        struct btrfs_path *path;
        struct btrfs_key key;
-        struct btrfs_chunk *chunk;
        struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
@@ -1965,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
                if (found_key.objectid != key.objectid)
                        break;
-                chunk = btrfs_item_ptr(path->nodes[0],
-                                       path->slots[0],
-                                       struct btrfs_chunk);
                /* chunk zero is special */
                if (found_key.offset == 0)
                        break;
@@ -3034,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                }
                bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
                dev = multi->stripes[dev_nr].dev;
-                BUG_ON(rw == WRITE && !dev->writeable);
+                if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-                if (dev && dev->bdev) {
                        bio->bi_bdev = dev->bdev;
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
@@ -3084,7 +3076,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
-        device->barriers = 1;
        device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..2b638b6e4eea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
        int running_pending;
        u64 generation;
-        int barriers;
        int writeable;
        int in_fs_metadata;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb215878..698fdd2c739c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
        struct inode *inode = dentry->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
-        struct btrfs_item *item;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
                }
                advance = 1;
-                item = btrfs_item_nr(leaf, slot);
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                /* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..b9cd5445f71c 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        int nr_pages = 0;
        struct page *in_page = NULL;
        struct page *out_page = NULL;
-        int out_written = 0;
-        int in_read = 0;
        unsigned long bytes_left;
        *out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
        workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
-        out_written = 0;
-        in_read = 0;
        while (workspace->def_strm.total_in < len) {
                ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
                if (ret != Z_OK) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..5930e382959b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
-                if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
+                if (!quiet_error(bh)) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
@@ -905,7 +905,6 @@ try_again:
                bh->b_state = 0;
                atomic_set(&bh->b_count, 0);
-                bh->b_private = NULL;
                bh->b_size = size;
                /* Link the buffer to its page */
@@ -1706,7 +1705,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
@@ -1834,9 +1833,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
 {
+        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
@@ -1916,7 +1917,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
        }
        return err;
 }
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
 static int __block_commit_write(struct inode *inode, struct page *page,
                unsigned from, unsigned to)
@@ -1953,15 +1954,6 @@ static int __block_commit_write(struct inode *inode, struct page *page,
        return 0;
 }
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-                get_block_t *get_block)
-{
-        unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-        return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
 /*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
@@ -2379,7 +2371,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        else
                end = PAGE_CACHE_SIZE;
-        ret = block_prepare_write(page, 0, end, get_block);
+        ret = __block_write_begin(page, 0, end, get_block);
        if (!ret)
                ret = block_commit_write(page, 0, end);
@@ -2466,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping,
        *fsdata = NULL;
        if (page_has_buffers(page)) {
-                unlock_page(page);
+                ret = __block_write_begin(page, pos, len, get_block);
-                page_cache_release(page);
+                if (unlikely(ret))
-                *pagep = NULL;
+                        goto out_release;
-                return block_write_begin(mapping, pos, len, flags, pagep,
+                return ret;
-                                         get_block);
        }
        if (PageMappedToDisk(page))
@@ -2891,7 +2882,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
        if (err == -EOPNOTSUPP) {
                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-                set_bit(BH_Eopnotsupp, &bh->b_state);
        }
        if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3021,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
                bh->b_end_io = end_buffer_write_sync;
                ret = submit_bh(rw, bh);
                wait_on_buffer(bh);
-                if (buffer_eopnotsupp(bh)) {
-                        clear_buffer_eopnotsupp(bh);
-                        ret = -EOPNOTSUPP;
-                }
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
        } else {
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 727caedcdd92..0a1467b15516 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -55,6 +55,7 @@ const struct file_operations cachefiles_daemon_fops = {
        .read           = cachefiles_daemon_read,
        .write          = cachefiles_daemon_write,
        .poll           = cachefiles_daemon_poll,
+        .llseek         = noop_llseek,
 };
 struct cachefiles_daemon_cmd {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 51bcc5ce3230..e9c874abc9e1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping,
                                 struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc;
        pgoff_t index, start, end;
@@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping,
        pagevec_init(&pvec, 0);
-        /* ?? */
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                dout(" writepages congested\n");
-                wbc->encountered_congestion = 1;
-                goto out_final;
-        }
        /* where to start/end? */
        if (wbc->range_cyclic) {
                start = mapping->writeback_index; /* Start from prev offset */
@@ -885,7 +877,6 @@ out:
                rc = 0;  /* vfs expects us to return 0 */
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
-out_final:
        return rc;
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d6e0e0421891..08b460ae0539 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 /*
 * mount: join the ceph cluster, and open root directory.
 */
-static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
                      const char *path)
 {
        int err;
@@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
                }
        }
-        mnt->mnt_root = root;
-        mnt->mnt_sb = fsc->sb;
        fsc->mount_state = CEPH_MOUNT_MOUNTED;
        dout("mount success\n");
-        err = 0;
+        mutex_unlock(&fsc->client->mount_mutex);
+        return root;
 out:
        mutex_unlock(&fsc->client->mount_mutex);
-        return err;
+        return ERR_PTR(err);
 fail:
        if (first) {
@@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb,
        return err;
 }
-static int ceph_get_sb(struct file_system_type *fs_type,
+static struct dentry *ceph_mount(struct file_system_type *fs_type,
-                       int flags, const char *dev_name, void *data,
+                       int flags, const char *dev_name, void *data)
-                       struct vfsmount *mnt)
 {
        struct super_block *sb;
        struct ceph_fs_client *fsc;
+        struct dentry *res;
        int err;
        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
        const char *path = NULL;
        struct ceph_mount_options *fsopt = NULL;
        struct ceph_options *opt = NULL;
-        dout("ceph_get_sb\n");
+        dout("ceph_mount\n");
        err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
-        if (err < 0)
+        if (err < 0) {
+                res = ERR_PTR(err);
                goto out_final;
+        }
        /* create client (which we may/may not use) */
        fsc = create_fs_client(fsopt, opt);
        if (IS_ERR(fsc)) {
-                err = PTR_ERR(fsc);
+                res = ERR_CAST(fsc);
                kfree(fsopt);
                kfree(opt);
                goto out_final;
        }
        err = ceph_mdsc_init(fsc);
-        if (err < 0)
+        if (err < 0) {
+                res = ERR_PTR(err);
                goto out;
+        }
        if (ceph_test_opt(fsc->client, NOSHARE))
                compare_super = NULL;
        sb = sget(fs_type, compare_super, ceph_set_super, fsc);
        if (IS_ERR(sb)) {
-                err = PTR_ERR(sb);
+                res = ERR_CAST(sb);
                goto out;
        }
@@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type,
        } else {
                dout("get_sb using new client %p\n", fsc);
                err = ceph_register_bdi(sb, fsc);
-                if (err < 0)
+                if (err < 0) {
+                        res = ERR_PTR(err);
                        goto out_splat;
+                }
        }
-        err = ceph_mount(fsc, mnt, path);
+        res = ceph_real_mount(fsc, path);
-        if (err < 0)
+        if (IS_ERR(res))
                goto out_splat;
-        dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+        dout("root %p inode %p ino %llx.%llx\n", res,
-             mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+             res->d_inode, ceph_vinop(res->d_inode));
-        return 0;
+        return res;
 out_splat:
        ceph_mdsc_close_sessions(fsc->mdsc);
@@ -843,8 +847,8 @@ out:
        ceph_mdsc_destroy(fsc);
        destroy_fs_client(fsc);
 out_final:
-        dout("ceph_get_sb fail %d\n", err);
+        dout("ceph_mount fail %ld\n", PTR_ERR(res));
-        return err;
+        return res;
 }
 static void ceph_kill_sb(struct super_block *s)
@@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s)
 static struct file_system_type ceph_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ceph",
-        .get_sb         = ceph_get_sb,
+        .mount          = ceph_mount,
        .kill_sb        = ceph_kill_sb,
        .fs_flags       = FS_RENAME_DOES_D_MOVE,
 };
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 143d393881cb..e5b9df993b93 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -456,6 +456,7 @@ static void cdev_purge(struct cdev *cdev)
 */
 const struct file_operations def_chr_fops = {
        .open = chrdev_open,
+        .llseek = noop_llseek,
 };
 static struct kobject *exact_match(dev_t dev, int *part, void *data)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0ed213970ced 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,9 @@ config CIFS
        tristate "CIFS support (advanced network filesystem, SMBFS successor)"
        depends on INET
        select NLS
+        select CRYPTO
+        select CRYPTO_MD5
+        select CRYPTO_ARC4
        help
          This is the client VFS module for the Common Internet File System
          (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/README b/fs/cifs/README
index 7099a526f775..ee68d1036544 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -527,6 +527,11 @@ A partial list of the supported mount options follows:
                SFU does).  In the future the bottom 9 bits of the
                mode also will be emulated using queries of the security
                descriptor (ACL).
+ mfsymlinks     Enable support for Minshall+French symlinks
+                (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+                This option is ignored when specified together with the
+                'sfu' option. Minshall+French symlinks are used even if
+                the server supports the CIFS Unix Extensions.
 sign           Must use packet signing (helps avoid unwanted data modification
                by intermediate systems in the route).  Note that signing
                does not work with lanman or plaintext authentication.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
 v) mount check for unmatched uids
-w) Add support for new vfs entry points for setlease and fallocate 
+w) Add support for new vfs entry point for fallocate
 x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
 processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index eb1ba493489f..103ab8b605b0 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -148,7 +148,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
        seq_printf(m, "Servers:");
        i = 0;
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp1, &cifs_tcp_ses_list) {
                server = list_entry(tmp1, struct TCP_Server_Info,
                                    tcp_ses_list);
@@ -230,7 +230,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                        spin_unlock(&GlobalMid_Lock);
                }
        }
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        seq_putc(m, '\n');
        /* BB add code to dump additional info such as TCP session info now */
@@ -270,7 +270,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                atomic_set(&totBufAllocCount, 0);
                atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-                read_lock(&cifs_tcp_ses_lock);
+                spin_lock(&cifs_tcp_ses_lock);
                list_for_each(tmp1, &cifs_tcp_ses_list) {
                        server = list_entry(tmp1, struct TCP_Server_Info,
                                            tcp_ses_list);
@@ -303,7 +303,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                                }
                        }
                }
-                read_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
        }
        return count;
@@ -343,7 +343,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                GlobalCurrentXid, GlobalMaxActiveXid);
        i = 0;
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp1, &cifs_tcp_ses_list) {
                server = list_entry(tmp1, struct TCP_Server_Info,
                                    tcp_ses_list);
@@ -397,7 +397,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                        }
                }
        }
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        seq_putc(m, '\n');
        return 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index aa316891ac0c..8942b28cf807 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -34,7 +34,7 @@ void cifs_dump_mids(struct TCP_Server_Info *);
 extern int traceSMB;            /* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
 #define CIFS_INFO       0x01
-#define CIFS_RC         0x02
+#define CIFS_RC         0x02
 #define CIFS_TIMER      0x04
 /*
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d6ced7aa23cf..c68a056f27fd 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -44,8 +44,7 @@ static void cifs_dfs_expire_automounts(struct work_struct *work)
 void cifs_dfs_release_automount_timer(void)
 {
        BUG_ON(!list_empty(&cifs_dfs_automount_list));
-        cancel_delayed_work(&cifs_dfs_automount_task);
+        cancel_delayed_work_sync(&cifs_dfs_automount_task);
-        flush_scheduled_work();
 }
 /**
@@ -306,6 +305,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
        int xid, i;
        int rc = 0;
        struct vfsmount *mnt = ERR_PTR(-ENOENT);
+        struct tcon_link *tlink;
        cFYI(1, "in %s", __func__);
        BUG_ON(IS_ROOT(dentry));
@@ -315,14 +315,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
        dput(nd->path.dentry);
        nd->path.dentry = dget(dentry);
-        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-        ses = cifs_sb->tcon->ses;
-        if (!ses) {
-                rc = -EINVAL;
-                goto out_err;
-        }
        /*
         * The MSDFS spec states that paths in DFS referral requests and
         * responses must be prefixed by a single '\' character instead of
@@ -335,10 +327,20 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
                goto out_err;
        }
-        rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls,
+        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                goto out_err;
+        }
+        ses = tlink_tcon(tlink)->ses;
+        rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
                &num_referrals, &referrals,
                cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        cifs_put_tlink(tlink);
        for (i = 0; i < num_referrals; i++) {
                int len;
                dump_referral(referrals+i);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9e771450c3b8..e9a393c9c2ca 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,6 +15,8 @@
 *   the GNU Lesser General Public License for more details.
 *
 */
+#include <linux/rbtree.h>
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
@@ -36,23 +38,28 @@
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
 #define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
 #define CIFS_MOUNT_FSCACHE      0x8000 /* local caching enabled */
+#define CIFS_MOUNT_MF_SYMLINKS  0x10000 /* Minshall+French Symlinks enabled */
+#define CIFS_MOUNT_MULTIUSER    0x20000 /* multiuser mount */
 struct cifs_sb_info {
-        struct cifsTconInfo *tcon;      /* primary mount */
+        struct rb_root tlink_tree;
-        struct list_head nested_tcon_q;
+        spinlock_t tlink_tree_lock;
+        struct tcon_link *master_tlink;
        struct nls_table *local_nls;
        unsigned int rsize;
        unsigned int wsize;
+        atomic_t active;
        uid_t   mnt_uid;
        gid_t   mnt_gid;
        mode_t  mnt_file_mode;
        mode_t  mnt_dir_mode;
-        int     mnt_cifs_flags;
+        unsigned int mnt_cifs_flags;
        int     prepathlen;
        char   *prepath; /* relative path under the share to mount to */
 #ifdef CONFIG_CIFS_DFS_UPCALL
        char   *mountdata; /* mount options received at mount time */
 #endif
        struct backing_dev_info bdi;
+        struct delayed_work prune_tlinks;
 };
 #endif                          /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 85d7cf7ff2c8..c9b4792ae825 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -557,11 +557,16 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 {
        struct cifs_ntsd *pntsd = NULL;
        int xid, rc;
+        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return NULL;
        xid = GetXid();
-        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+        rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
        return pntsd;
@@ -574,10 +579,16 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
        int oplock = 0;
        int xid, rc;
        __u16 fid;
+        struct cifsTconInfo *tcon;
+        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return NULL;
+        tcon = tlink_tcon(tlink);
        xid = GetXid();
-        rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+        rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
                         &fid, &oplock, NULL, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
@@ -585,11 +596,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
                goto out;
        }
-        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+        rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
        cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
-        CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        CIFSSMBClose(xid, tcon, fid);
 out:
+        cifs_put_tlink(tlink);
        FreeXid(xid);
        return pntsd;
 }
@@ -603,7 +615,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
        struct cifsFileInfo *open_file = NULL;
        if (inode)
-                open_file = find_readable_file(CIFS_I(inode));
+                open_file = find_readable_file(CIFS_I(inode), true);
        if (!open_file)
                return get_cifs_acl_by_path(cifs_sb, path, pacllen);
@@ -616,10 +628,15 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
                struct cifs_ntsd *pnntsd, u32 acllen)
 {
        int xid, rc;
+        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
        xid = GetXid();
-        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+        rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        cFYI(DBG2, "SetCIFSACL rc = %d", rc);
        return rc;
@@ -631,10 +648,16 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
        int oplock = 0;
        int xid, rc;
        __u16 fid;
+        struct cifsTconInfo *tcon;
+        struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
        xid = GetXid();
-        rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+        rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0,
                         &fid, &oplock, NULL, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
@@ -642,12 +665,13 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
                goto out;
        }
-        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+        rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen);
        cFYI(DBG2, "SetCIFSACL rc = %d", rc);
-        CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        CIFSSMBClose(xid, tcon, fid);
- out:
+out:
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -661,7 +685,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
        cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
-        open_file = find_readable_file(CIFS_I(inode));
+        open_file = find_readable_file(CIFS_I(inode), true);
        if (!open_file)
                return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35042d8f7338..f856732161ab 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
@@ -42,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
                       unsigned char *p24);
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-                                    const struct mac_key *key, char *signature)
+                                struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
+        int rc;
-        if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+        if (cifs_pdu == NULL || signature == NULL || server == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->secmech.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1, "%s: Can't generate signature\n", __func__);
-        cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+                return -1;
+        }
+        rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Oould not init md5\n", __func__);
+                return rc;
+        }
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                server->session_key.response, server->session_key.len);
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+        rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
-        cifs_MD5_final(signature, &context);
        return 0;
 }
@@ -78,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
+        rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -89,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-                                const struct mac_key *key, char *signature)
+                                struct TCP_Server_Info *server, char *signature)
 {
-        struct  MD5Context context;
        int i;
+        int rc;
-        if ((iov == NULL) || (signature == NULL) || (key == NULL))
+        if (iov == NULL || signature == NULL || server == NULL)
                return -EINVAL;
-        cifs_MD5_init(&context);
+        if (!server->secmech.sdescmd5) {
-        cifs_MD5_update(&context, (char *)&key->data, key->len);
+                cERROR(1, "%s: Can't generate signature\n", __func__);
+                return -1;
+        }
+        rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Oould not init md5\n", __func__);
+                return rc;
+        }
+        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                server->session_key.response, server->session_key.len);
        for (i = 0; i < n_vec; i++) {
                if (iov[i].iov_len == 0)
                        continue;
@@ -111,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
                if (i == 0) {
                        if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
                                break; /* nothing to sign or corrupt header */
-                        cifs_MD5_update(&context, iov[0].iov_base+4,
+                        crypto_shash_update(&server->secmech.sdescmd5->shash,
-                                  iov[0].iov_len-4);
+                                iov[i].iov_base + 4, iov[i].iov_len - 4);
                } else
-                        cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+                        crypto_shash_update(&server->secmech.sdescmd5->shash,
+                                iov[i].iov_base, iov[i].iov_len);
        }
-        cifs_MD5_final(signature, &context);
+        rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
-        return 0;
+        return rc;
 }
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
                   __u32 *pexpected_response_sequence_number)
 {
@@ -145,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
        server->sequence_number++;
        spin_unlock(&GlobalMid_Lock);
-        rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
+        rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
-                                      smb_signature);
        if (rc)
                memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
        else
@@ -156,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-                          const struct mac_key *mac_key,
+                          struct TCP_Server_Info *server,
                          __u32 expected_sequence_number)
 {
        unsigned int rc;
        char server_response_sig[8];
        char what_we_think_sig_should_be[20];
-        if ((cifs_pdu == NULL) || (mac_key == NULL))
+        if (cifs_pdu == NULL || server == NULL)
                return -EINVAL;
        if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
                                        cpu_to_le32(expected_sequence_number);
        cifs_pdu->Signature.Sequence.Reserved = 0;
-        rc = cifs_calculate_signature(cifs_pdu, mac_key,
+        rc = cifs_calculate_signature(cifs_pdu, server,
                what_we_think_sig_should_be);
        if (rc)
@@ -208,18 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
-/* We fill in key by putting in 40 byte array which was allocated by caller */
+/*
+ * Build the NTLM authentication data for @ses in ses->auth_key.response.
+ * First calculate the 24 byte ntlm response and then the 16 byte session
+ * key.  The buffer holds the session key in its first CIFS_SESS_KEY_SIZE
+ * bytes, followed by the response.
+ * Returns 0 on success, -EINVAL when @ses is NULL and -ENOMEM when the
+ * buffer cannot be allocated.
+ */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+int setup_ntlm_response(struct cifsSesInfo *ses)
-                           const char *password)
 {
-        char temp_key[16];
+        unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
-        if ((key == NULL) || (rn == NULL))
+        char temp_key[CIFS_SESS_KEY_SIZE];
+        if (!ses)
                 return -EINVAL;
-        E_md4hash(password, temp_key);
+        ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
-        mdfour(key->data.ntlm, temp_key, 16);
+        if (!ses->auth_key.response) {
-        memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE);
+                cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
-        key->len = 40;
+                return -ENOMEM;
+        }
+        ses->auth_key.len = temp_len;
+        /* the response is written after the room kept for the session key */
+        SMBNTencrypt(ses->password, ses->server->cryptkey,
+                        ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+        /* session key: mdfour() over the md4 hash of the password */
+        E_md4hash(ses->password, temp_key);
+        mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
         return 0;
 }
@@ -262,109 +297,457 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 }
 #endif /* CIFS_WEAK_PW_HASH */
-static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
+/*
+ * Write one name-valued av pair at @pos: the type/length header followed
+ * by @name converted to unicode (2 bytes per input byte are accounted for).
+ * Returns the position right after the value.
+ */
+static unsigned char *
+put_av_name(unsigned char *pos, unsigned int type, const char *name,
+                unsigned int len, const struct nls_table *nls_cp)
+{
+        struct ntlmssp2_name *av = (struct ntlmssp2_name *)pos;
+        av->type = cpu_to_le16(type);
+        av->length = cpu_to_le16(2 * len);
+        pos += sizeof(struct ntlmssp2_name);
+        cifs_strtoUCS((__le16 *)pos, name, len, nls_cp);
+        return pos + 2 * len;
+}
+/*
+ * Construct an attribute value/target info pairs blob in ses->auth_key.
+ * It carries the netbios and dns domain names, the netbios and dns
+ * computer names (the server hostname is used for both) and the client
+ * time: five av pairs in total plus one end of fields indicator.
+ * When no domain name is set a default one is allocated; it gets freed
+ * when the session struct is deallocated.
+ * Returns 0 on success or -ENOMEM on allocation failure.
+ */
+static int
+build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+        unsigned int dom_len;
+        unsigned int host_len;
+        unsigned int hdr_bytes = 6 * sizeof(struct ntlmssp2_name);
+        __le64 now;
+        unsigned char *pos;
+        struct ntlmssp2_name *av;
+        if (!ses->domainName) {
+                ses->domainName = kstrdup("WORKGROUP", GFP_KERNEL);
+                if (!ses->domainName)
+                        return -ENOMEM;
+        }
+        dom_len = strlen(ses->domainName);
+        host_len = strlen(ses->server->hostname);
+        /*
+         * Blob size: six name/size headers, the domain name twice and the
+         * server name twice (each in unicode, i.e. two bytes per
+         * character) and an 8 byte timestamp.
+         */
+        ses->auth_key.len = hdr_bytes + 2 * (2 * dom_len) +
+                        2 * (2 * host_len) + 8;
+        ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
+        if (!ses->auth_key.response) {
+                ses->auth_key.len = 0;
+                cERROR(1, "Challenge target info allocation failure");
+                return -ENOMEM;
+        }
+        pos = ses->auth_key.response;
+        pos = put_av_name(pos, NTLMSSP_AV_NB_DOMAIN_NAME,
+                        ses->domainName, dom_len, nls_cp);
+        pos = put_av_name(pos, NTLMSSP_AV_NB_COMPUTER_NAME,
+                        ses->server->hostname, host_len, nls_cp);
+        pos = put_av_name(pos, NTLMSSP_AV_DNS_DOMAIN_NAME,
+                        ses->domainName, dom_len, nls_cp);
+        pos = put_av_name(pos, NTLMSSP_AV_DNS_COMPUTER_NAME,
+                        ses->server->hostname, host_len, nls_cp);
+        av = (struct ntlmssp2_name *)pos;
+        av->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
+        av->length = cpu_to_le16(sizeof(__le64));
+        pos += sizeof(struct ntlmssp2_name);
+        now = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+        memcpy(pos, &now, sizeof(__le64));
+        /*
+         * The sixth (end of fields) header is never written here; it stays
+         * as zero-filled by kzalloc().
+         */
+        return 0;
+}
+/*
+ * Look through the av pairs/target info blob that the server sent in its
+ * type 2 challenge (kept in ses->auth_key) for the netbios domain name,
+ * and adopt it as ses->domainName when none was specified on the command
+ * line.  The domain name is used as part of ntlmv2 authentication
+ * (in the Target String).
+ * Returning 0 does not guarantee that a domain name was fetched; in that
+ * case authentication may fail against some servers but not against
+ * others (those which are not very particular about the target string,
+ * i.e. for some just the user name might suffice).
+ * Returns -ENOMEM if the domain name buffer cannot be allocated.
+ */
+static int
+find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
+{
+        unsigned int av_type;
+        unsigned int av_len;
+        unsigned char *cur;
+        unsigned char *end;
+        struct ntlmssp2_name *av;
+        if (!ses->auth_key.len || !ses->auth_key.response)
+                return 0;
+        cur = ses->auth_key.response;
+        end = cur + ses->auth_key.len;
+        while (cur + sizeof(struct ntlmssp2_name) < end) {
+                av = (struct ntlmssp2_name *)cur;
+                av_type = le16_to_cpu(av->type);
+                if (av_type == NTLMSSP_AV_EOL)
+                        break;
+                av_len = le16_to_cpu(av->length);
+                cur += 4; /* step over attr type (2) and attr size (2) */
+                if (cur + av_len > end)
+                        break; /* value would run past the blob */
+                if (av_type != NTLMSSP_AV_NB_DOMAIN_NAME) {
+                        cur += av_len; /* not interesting, next pair */
+                        continue;
+                }
+                if (!av_len)
+                        break;
+                if (ses->domainName) {
+                        cur += av_len; /* keep the name already set */
+                        continue;
+                }
+                ses->domainName = kmalloc(av_len + 1, GFP_KERNEL);
+                if (!ses->domainName)
+                        return -ENOMEM;
+                cifs_from_ucs2(ses->domainName, (__le16 *)cur, av_len,
+                                av_len, nls_cp, false);
+                break;
+        }
+        return 0;
+}
+/*
+ * Compute the NTLMv2 hash for @ses into @ntlmv2_hash.
+ * The hmacmd5 transform is keyed with the md4 hash of the password and
+ * run over the uppercased unicode user name followed by the unicode
+ * domain name or, when no domain name is set, the unicode server name.
+ * Returns 0 on success, -1 when the hmacmd5 descriptor is missing,
+ * -ENOMEM on allocation failure, otherwise the error reported by
+ * crypto_shash_init() or crypto_shash_final().
+ */
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
                             const struct nls_table *nls_cp)
 {
         int rc = 0;
         int len;
-        char nt_hash[16];
+        char nt_hash[CIFS_NTHASH_SIZE];
-        struct HMACMD5Context *pctxt;
         wchar_t *user;
         wchar_t *domain;
+        wchar_t *server;
-        pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
+        if (!ses->server->secmech.sdeschmacmd5) {
+                cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
-        if (pctxt == NULL)
+                return -1;
-                return -ENOMEM;
+        }
         /* calculate md4 hash of password */
         E_md4hash(ses->password, nt_hash);
-        /* convert Domainname to unicode and uppercase */
+        /* the md4 hash is the hmac key; the setkey result is not checked */
+        crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
-        hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+                                CIFS_NTHASH_SIZE);
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+                return rc;
+        }
         /* convert ses->userName to unicode and uppercase */
         len = strlen(ses->userName);
         user = kmalloc(2 + (len * 2), GFP_KERNEL);
-        if (user == NULL)
+        if (user == NULL) {
+                cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+                rc = -ENOMEM;
                 goto calc_exit_2;
+        }
         len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
         UniStrupr(user);
-        hmac_md5_update((char *)user, 2*len, pctxt);
+        crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                                (char *)user, 2 * len);
         /* convert ses->domainName to unicode and uppercase */
         if (ses->domainName) {
                 len = strlen(ses->domainName);
                 domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-                if (domain == NULL)
+                if (domain == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+                        rc = -ENOMEM;
                         goto calc_exit_1;
+                }
                 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
                                         nls_cp);
-                /* the following line was removed since it didn't work well
+                /* unlike the user name, the domain is hashed as converted */
+                crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
-                   with lower cased domain name that passed as an option.
+                                        (char *)domain, 2 * len);
-                   Maybe converting the domain name earlier makes sense */
-                /* UniStrupr(domain); */
-                hmac_md5_update((char *)domain, 2*len, pctxt);
                 kfree(domain);
+        } else if (ses->serverName) {
+                /* no domain name: hash the server name in its place */
+                len = strlen(ses->serverName);
+                server = kmalloc(2 + (len * 2), GFP_KERNEL);
+                if (server == NULL) {
+                        cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+                        rc = -ENOMEM;
+                        goto calc_exit_1;
+                }
+                len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+                                        nls_cp);
+                crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                                        (char *)server, 2 * len);
+                kfree(server);
         }
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                                        ntlmv2_hash);
 calc_exit_1:
         kfree(user);
 calc_exit_2:
-        /* BB FIXME what about bytes 24 through 40 of the signing key?
+        return rc;
-           compare with the NTLM example */
+}
-        hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+/*
+ * Calculate the first part of the NTLMv2 client response for @ses.
+ * The hmacmd5 transform is keyed with @ntlmv2_hash.  The server challenge
+ * (taken from ses->ntlmssp for RawNTLMSSP, from ses->server otherwise) is
+ * copied into ses->auth_key.response at CIFS_SESS_KEY_SIZE + 8, and the
+ * hash is run from there to the end of the buffer.  The digest is stored
+ * at ses->auth_key.response + CIFS_SESS_KEY_SIZE, i.e. 8 bytes in front
+ * of where the challenge was placed.
+ * Returns 0 on success, -1 when the hmacmd5 descriptor is missing,
+ * otherwise the error from crypto_shash_init() or crypto_shash_final().
+ */
+static int
+CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash)
+{
+        int rc;
+        unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+        if (!ses->server->secmech.sdeschmacmd5) {
+                cERROR(1, "CalcNTLMv2_response: can't generate ntlmv2 hash\n");
+                return -1;
+        }
+        crypto_shash_setkey(ses->server->secmech.hmacmd5,
+                                ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+                return rc;
+        }
+        /* the challenge comes from a different place for raw ntlmssp */
+        if (ses->server->secType == RawNTLMSSP)
+                memcpy(ses->auth_key.response + offset,
+                        ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+        else
+                memcpy(ses->auth_key.response + offset,
+                        ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+        crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response + offset, ses->auth_key.len - offset);
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response + CIFS_SESS_KEY_SIZE);
-        kfree(pctxt);
         return rc;
 }
+/*
+ * Build the NTLMv2 response and session key for @ses.
+ * On success ses->auth_key.response is laid out as
+ *   [session key: CIFS_SESS_KEY_SIZE][struct ntlmv2_resp][target info]
+ * where the target info blob is either the one already stored in
+ * ses->auth_key (RawNTLMSSP) or one built by build_avpair_blob().
+ * The previous blob is always freed before returning.
+ * Returns 0 on success or a negative error code.
+ */
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-                      const struct nls_table *nls_cp)
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp)
 {
         int rc;
-        struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
+        int baselen;
-        struct HMACMD5Context context;
+        unsigned int tilen;
+        struct ntlmv2_resp *buf;
+        char ntlmv2_hash[16];
+        unsigned char *tiblob = NULL; /* target info blob */
+        if (ses->server->secType == RawNTLMSSP) {
+                if (!ses->domainName) {
+                        rc = find_domain_name(ses, nls_cp);
+                        if (rc) {
+                                cERROR(1, "error %d finding domain name", rc);
+                                goto setup_ntlmv2_rsp_ret;
+                        }
+                }
+        } else {
+                rc = build_avpair_blob(ses, nls_cp);
+                if (rc) {
+                        cERROR(1, "error %d building av pair blob", rc);
+                        goto setup_ntlmv2_rsp_ret;
+                }
+        }
+        baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+        /* hold on to the target info; it is copied behind the response */
+        tilen = ses->auth_key.len;
+        tiblob = ses->auth_key.response;
+        ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
+        if (!ses->auth_key.response) {
+                /* error codes are negative: a positive ENOMEM is not one */
+                rc = -ENOMEM;
+                ses->auth_key.len = 0;
+                cERROR(1, "%s: Can't allocate auth blob", __func__);
+                goto setup_ntlmv2_rsp_ret;
+        }
+        ses->auth_key.len += baselen;
+        buf = (struct ntlmv2_resp *)
+                        (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
         buf->blob_signature = cpu_to_le32(0x00000101);
         buf->reserved = 0;
         buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
         get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
         buf->reserved2 = 0;
-        buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-        buf->names[0].length = 0;
-        buf->names[1].type = 0;
-        buf->names[1].length = 0;
-        /* calculate buf->ntlmv2_hash */
+        memcpy(ses->auth_key.response + baselen, tiblob, tilen);
-        rc = calc_ntlmv2_hash(ses, nls_cp);
-        if (rc)
+        /* calculate ntlmv2_hash */
+        rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+        if (rc) {
                 cERROR(1, "could not get v2 hash rc %d", rc);
-        CalcNTLMv2_response(ses, resp_buf);
+                goto setup_ntlmv2_rsp_ret;
+        }
+        /* calculate first part of the client response (CR1) */
+        rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+        if (rc) {
+                cERROR(1, "Could not calculate CR1  rc: %d", rc);
+                goto setup_ntlmv2_rsp_ret;
+        }
+        /* now calculate the session key for NTLMv2 */
+        crypto_shash_setkey(ses->server->secmech.hmacmd5,
+                ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+        rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
+        if (rc) {
+                cERROR(1, "%s: Could not init hmacmd5\n", __func__);
+                goto setup_ntlmv2_rsp_ret;
+        }
+        /* hash CR1; the result lands in the session key slot up front */
+        crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                CIFS_HMAC_MD5_HASH_SIZE);
+        rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
+                ses->auth_key.response);
+setup_ntlmv2_rsp_ret:
+        kfree(tiblob);
+        return rc;
+}
-        /* now calculate the MAC key for NTLMv2 */
+/*
+ * Generate a random secondary session key for @ses.
+ * The nonce is encrypted with arc4, keyed by the current session key (the
+ * first CIFS_SESS_KEY_SIZE bytes of ses->auth_key.response), into
+ * ses->ntlmssp->ciphertext.  The nonce then replaces the session key and
+ * ses->auth_key.len is cut down to the session key only.
+ * Returns 0 on success or a negative error code; on error the session
+ * key is left untouched.
+ */
+int
-        hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+calc_seckey(struct cifsSesInfo *ses)
-        hmac_md5_update(resp_buf, 16, &context);
+{
-        hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+        int rc;
+        struct crypto_blkcipher *tfm_arc4;
+        struct scatterlist sgin, sgout;
+        struct blkcipher_desc desc;
+        unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+        get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
+        tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+        if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+                cERROR(1, "could not allocate crypto API arc4\n");
+                /* PTR_ERR(NULL) is 0 and would be read as success */
+                return tfm_arc4 ? PTR_ERR(tfm_arc4) : -ENOMEM;
+        }
+        desc.tfm = tfm_arc4;
+        /* do not hand uninitialized request flags to the cipher */
+        desc.flags = 0;
+        rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+                                        CIFS_SESS_KEY_SIZE);
+        if (rc) {
+                cERROR(1, "could not set arc4 key rc: %d\n", rc);
+                crypto_free_blkcipher(tfm_arc4);
+                return rc;
+        }
+        sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
+        sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
+        rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+        if (rc) {
+                cERROR(1, "could not encrypt session key rc: %d\n", rc);
+                crypto_free_blkcipher(tfm_arc4);
+                return rc;
+        }
+        /* make secondary_key/nonce as session key */
+        memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
+        /* and make len as that of session key only */
+        ses->auth_key.len = CIFS_SESS_KEY_SIZE;
-        memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
+        crypto_free_blkcipher(tfm_arc4);
-               sizeof(struct ntlmv2_resp));
-        ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+        return 0;
 }
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
+/*
+ * Release the crypto state hanging off @server: the md5 and hmacmd5
+ * transforms (when set) and both hash descriptors.
+ * The pointers in server->secmech are not reset here.
+ */
+void
-                         char *v2_session_response)
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
 {
-        struct HMACMD5Context context;
+        if (server->secmech.md5)
-        /* rest of v2 struct already generated */
+                crypto_free_shash(server->secmech.md5);
-        memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-        hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-        hmac_md5_update(v2_session_response+8,
+        if (server->secmech.hmacmd5)
-                        sizeof(struct ntlmv2_resp) - 8, &context);
+                crypto_free_shash(server->secmech.hmacmd5);
-        hmac_md5_final(v2_session_response, &context);
+        /* descriptors are plain kmalloc memory, see the allocate routine */
+        kfree(server->secmech.sdeschmacmd5);
-/*      cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+        kfree(server->secmech.sdescmd5);
+}
+/*
+ * Allocate the crypto state used by @server: the "hmac(md5)" and "md5"
+ * transforms plus one hash descriptor for each of them.
+ * Returns 0 on success or a negative error code.  On failure everything
+ * allocated so far is freed and all four server->secmech pointers are
+ * reset to NULL, so that no freed or ERR_PTR value is left behind for
+ * cifs_crypto_shash_release() to trip over.
+ */
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+        int rc;
+        unsigned int size;
+        server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+        if (!server->secmech.hmacmd5 ||
+                        IS_ERR(server->secmech.hmacmd5)) {
+                cERROR(1, "could not allocate crypto hmacmd5\n");
+                /* PTR_ERR(NULL) is 0: never report success without a tfm */
+                rc = server->secmech.hmacmd5 ?
+                        PTR_ERR(server->secmech.hmacmd5) : -ENOMEM;
+                goto crypto_allocate_hmacmd5_fail;
+        }
+        server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
+        if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+                cERROR(1, "could not allocate crypto md5\n");
+                rc = server->secmech.md5 ?
+                        PTR_ERR(server->secmech.md5) : -ENOMEM;
+                goto crypto_allocate_md5_fail;
+        }
+        /* a descriptor is the shash_desc header plus the tfm's own state */
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->secmech.hmacmd5);
+        server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->secmech.sdeschmacmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+                rc = -ENOMEM;
+                goto crypto_allocate_hmacmd5_sdesc_fail;
+        }
+        server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
+        server->secmech.sdeschmacmd5->shash.flags = 0x0;
+        size = sizeof(struct shash_desc) +
+                        crypto_shash_descsize(server->secmech.md5);
+        server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
+        if (!server->secmech.sdescmd5) {
+                cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+                rc = -ENOMEM;
+                goto crypto_allocate_md5_sdesc_fail;
+        }
+        server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
+        server->secmech.sdescmd5->shash.flags = 0x0;
+        return 0;
+crypto_allocate_md5_sdesc_fail:
+        kfree(server->secmech.sdeschmacmd5);
+crypto_allocate_hmacmd5_sdesc_fail:
+        crypto_free_shash(server->secmech.md5);
+crypto_allocate_md5_fail:
+        crypto_free_shash(server->secmech.hmacmd5);
+crypto_allocate_hmacmd5_fail:
+        /* leave no freed or ERR_PTR values behind for a later release */
+        server->secmech.sdescmd5 = NULL;
+        server->secmech.sdeschmacmd5 = NULL;
+        server->secmech.md5 = NULL;
+        server->secmech.hmacmd5 = NULL;
+        return rc;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b7431afdd76d..9c3789762ab7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -35,7 +35,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <net/ipv6.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -82,6 +82,24 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
+/*
+ * Take a cifs-level active reference on @sb.  Only the transition of the
+ * cifs counter to 1 also bumps the superblock's own s_active count.
+ */
+void
+cifs_sb_active(struct super_block *sb)
+{
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        if (atomic_inc_return(&cifs_sb->active) != 1)
+                return;
+        atomic_inc(&sb->s_active);
+}
+/*
+ * Drop a cifs-level active reference on @sb.  When the cifs counter
+ * reaches zero the superblock itself is deactivated.
+ */
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        if (!atomic_dec_and_test(&cifs_sb->active))
+                return;
+        deactivate_super(sb);
+}
 static int
 cifs_read_super(struct super_block *sb, void *data,
                const char *devname, int silent)
@@ -97,6 +115,9 @@ cifs_read_super(struct super_block *sb, void *data,
        if (cifs_sb == NULL)
                return -ENOMEM;
+        spin_lock_init(&cifs_sb->tlink_tree_lock);
+        cifs_sb->tlink_tree = RB_ROOT;
        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
        if (rc) {
                kfree(cifs_sb);
@@ -136,9 +157,6 @@ cifs_read_super(struct super_block *sb, void *data,
        sb->s_magic = CIFS_MAGIC_NUMBER;
        sb->s_op = &cifs_super_ops;
        sb->s_bdi = &cifs_sb->bdi;
-/*      if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
-            sb->s_blocksize =
-                cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
        sb->s_blocksize = CIFS_MAX_MSGSIZE;
        sb->s_blocksize_bits = 14;      /* default 2**14 = CIFS_MAX_MSGSIZE */
        inode = cifs_root_iget(sb, ROOT_I);
@@ -200,8 +218,6 @@ cifs_put_super(struct super_block *sb)
                return;
        }
-        lock_kernel();
        rc = cifs_umount(sb, cifs_sb);
        if (rc)
                cERROR(1, "cifs_umount failed with return code %d", rc);
@@ -215,8 +231,6 @@ cifs_put_super(struct super_block *sb)
        unload_nls(cifs_sb->local_nls);
        bdi_destroy(&cifs_sb->bdi);
        kfree(cifs_sb);
-        unlock_kernel();
 }
 static int
@@ -224,7 +238,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        int rc = -EOPNOTSUPP;
        int xid;
@@ -304,12 +318,10 @@ cifs_alloc_inode(struct super_block *sb)
                return NULL;
        cifs_inode->cifsAttrs = 0x20;   /* default */
        cifs_inode->time = 0;
-        cifs_inode->write_behind_rc = 0;
        /* Until the file is open and we have gotten oplock
        info back from the server, can not assume caching of
        file data or metadata */
-        cifs_inode->clientCanCacheRead = false;
+        cifs_set_oplock_level(cifs_inode, 0);
-        cifs_inode->clientCanCacheAll = false;
        cifs_inode->delete_pending = false;
        cifs_inode->invalid_mapping = false;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
@@ -366,14 +378,36 @@ static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
+        struct sockaddr *srcaddr;
+        srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
        seq_printf(s, ",unc=%s", tcon->treeName);
-        if (tcon->ses->userName)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
+                seq_printf(s, ",multiuser");
+        else if (tcon->ses->userName)
                seq_printf(s, ",username=%s", tcon->ses->userName);
        if (tcon->ses->domainName)
                seq_printf(s, ",domain=%s", tcon->ses->domainName);
+        if (srcaddr->sa_family != AF_UNSPEC) {
+                struct sockaddr_in *saddr4;
+                struct sockaddr_in6 *saddr6;
+                saddr4 = (struct sockaddr_in *)srcaddr;
+                saddr6 = (struct sockaddr_in6 *)srcaddr;
+                if (srcaddr->sa_family == AF_INET6)
+                        seq_printf(s, ",srcaddr=%pI6c",
+                                   &saddr6->sin6_addr);
+                else if (srcaddr->sa_family == AF_INET)
+                        seq_printf(s, ",srcaddr=%pI4",
+                                   &saddr4->sin_addr.s_addr);
+                else
+                        seq_printf(s, ",srcaddr=BAD-AF:%i",
+                                   (int)(srcaddr->sa_family));
+        }
        seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
                seq_printf(s, ",forceuid");
@@ -422,6 +456,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
                seq_printf(s, ",dynperm");
        if (m->mnt_sb->s_flags & MS_POSIXACL)
                seq_printf(s, ",acl");
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+                seq_printf(s, ",mfsymlinks");
        seq_printf(s, ",rsize=%d", cifs_sb->rsize);
        seq_printf(s, ",wsize=%d", cifs_sb->wsize);
@@ -437,20 +473,18 @@ static void cifs_umount_begin(struct super_block *sb)
        if (cifs_sb == NULL)
                return;
-        tcon = cifs_sb->tcon;
+        tcon = cifs_sb_master_tcon(cifs_sb);
-        if (tcon == NULL)
-                return;
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) {
                /* we have other mounts to same share or we have
                   already tried to force umount this and woken up
                   all waiting network requests, nothing to do */
-                read_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return;
        } else if (tcon->tc_count == 1)
                tcon->tidStatus = CifsExiting;
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
        /* cancel_notify_requests(tcon); */
@@ -509,28 +543,29 @@ static const struct super_operations cifs_super_ops = {
 #endif
 };
+/*
+ * Mount entry point for the "cifs" filesystem type.
+ * Gets an anonymous superblock via sget(), fills it in with
+ * cifs_read_super() and marks it MS_ACTIVE.
+ * Returns a reference to the root dentry on success, or an ERR_PTR()
+ * carrying the error from sget() or cifs_read_super().
+ */
-static int
+static struct dentry *
-cifs_get_sb(struct file_system_type *fs_type,
+cifs_do_mount(struct file_system_type *fs_type,
-            int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+            int flags, const char *dev_name, void *data)
 {
         int rc;
-        struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
+        struct super_block *sb;
+        sb = sget(fs_type, NULL, set_anon_super, NULL);
         cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
         if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
         sb->s_flags = flags;
         rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
         if (rc) {
                 deactivate_locked_super(sb);
-                return rc;
+                return ERR_PTR(rc);
         }
         sb->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, sb);
+        /* the caller gets its own reference on the root dentry */
+        return dget(sb->s_root);
-        return 0;
 }
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
@@ -565,9 +600,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-        /* note that this is called by vfs setlease with the BKL held
+        /* note that this is called by vfs setlease with lock_flocks held
-           although I doubt that BKL is needed here in cifs */
+           to protect *lease from going away */
        struct inode *inode = file->f_path.dentry->d_inode;
+        struct cifsFileInfo *cfile = file->private_data;
        if (!(S_ISREG(inode->i_mode)))
                return -EINVAL;
@@ -578,8 +614,8 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
            ((arg == F_WRLCK) &&
                (CIFS_I(inode)->clientCanCacheAll)))
                return generic_setlease(file, arg, lease);
-        else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
+        else if (tlink_tcon(cfile->tlink)->local_lease &&
-                        !CIFS_I(inode)->clientCanCacheRead)
+                 !CIFS_I(inode)->clientCanCacheRead)
                /* If the server claims to support oplock on this
                   file, then we still need to check oplock even
                   if the local_lease mount option is set, but there
@@ -595,7 +631,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
+/* Filesystem type for "cifs"; mounting goes through cifs_do_mount(). */
 struct file_system_type cifs_fs_type = {
         .owner = THIS_MODULE,
         .name = "cifs",
-        .get_sb = cifs_get_sb,
+        .mount = cifs_do_mount,
         .kill_sb = kill_anon_super,
         /*  .fs_flags */
 };
@@ -898,8 +934,8 @@ init_cifs(void)
        GlobalTotalActiveXid = 0;
        GlobalMaxActiveXid = 0;
        memset(Local_System_Name, 0, 15);
-        rwlock_init(&GlobalSMBSeslock);
+        spin_lock_init(&cifs_tcp_ses_lock);
-        rwlock_init(&cifs_tcp_ses_lock);
+        spin_lock_init(&cifs_file_list_lock);
        spin_lock_init(&GlobalMid_Lock);
        if (cifs_max_pending < 2) {
@@ -912,11 +948,11 @@ init_cifs(void)
        rc = cifs_fscache_register();
        if (rc)
-                goto out;
+                goto out_clean_proc;
        rc = cifs_init_inodecache();
        if (rc)
-                goto out_clean_proc;
+                goto out_unreg_fscache;
        rc = cifs_init_mids();
        if (rc)
@@ -938,19 +974,19 @@ init_cifs(void)
        return 0;
 #ifdef CONFIG_CIFS_UPCALL
- out_unregister_filesystem:
+out_unregister_filesystem:
        unregister_filesystem(&cifs_fs_type);
 #endif
- out_destroy_request_bufs:
+out_destroy_request_bufs:
        cifs_destroy_request_bufs();
- out_destroy_mids:
+out_destroy_mids:
        cifs_destroy_mids();
- out_destroy_inodecache:
+out_destroy_inodecache:
        cifs_destroy_inodecache();
- out_clean_proc:
+out_unreg_fscache:
-        cifs_proc_clean();
        cifs_fscache_unregister();
- out:
+out_clean_proc:
+        cifs_proc_clean();
        return rc;
 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d82f5fb4761e..897b2b2b28b5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -42,10 +42,8 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 /* Functions related to super block operations */
-/* extern const struct super_operations cifs_super_ops;*/
+extern void cifs_sb_active(struct super_block *sb);
-extern void cifs_read_inode(struct inode *);
+extern void cifs_sb_deactive(struct super_block *sb);
-/*extern void cifs_delete_inode(struct inode *);*/  /* BB not needed yet */
-/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
@@ -104,7 +102,7 @@ extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
                        const char *symname);
 extern int      cifs_removexattr(struct dentry *, const char *);
-extern int      cifs_setxattr(struct dentry *, const char *, const void *,
+extern int      cifs_setxattr(struct dentry *, const char *, const void *,
                        size_t, int);
 extern ssize_t  cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t  cifs_listxattr(struct dentry *, char *, size_t);
@@ -114,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.65"
+#define CIFS_VERSION   "1.68"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..b577bf0a1bb3 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
 #include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
 /*
 * The sizes of various internal tables and strings
 */
@@ -74,7 +77,7 @@
 * CIFS vfs client Status information (based on what we know.)
 */
- /* associated with each tcp and smb session */
+/* associated with each tcp and smb session */
 enum statusEnum {
        CifsNew = 0,
        CifsGood,
@@ -97,16 +100,31 @@ enum protocolEnum {
        /* Netbios frames protocol not supported at this time */
 };
-struct mac_key {
+struct session_key {
        unsigned int len;
-        union {
+        char *response;
-                char ntlm[CIFS_SESS_KEY_SIZE + 16];
+};
-                char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
-                struct {
+/* crypto security descriptor definition */
-                        char key[16];
+struct sdesc {
-                        struct ntlmv2_resp resp;
+        struct shash_desc shash;
-                } ntlmv2;
+        char ctx[];
-        } data;
+};
+/* crypto hashing related structure/fields, not specific to a sec mech */
+struct cifs_secmech {
+        struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
+        struct crypto_shash *md5; /* md5 hash function */
+        struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
+        struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
+};
+/* per smb session structure/fields */
+struct ntlmssp_auth {
+        __u32 client_flags; /* sent by client in type 1 ntlmssp exchange */
+        __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
+        unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
+        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */
 };
 struct cifs_cred {
@@ -139,6 +157,7 @@ struct TCP_Server_Info {
                struct sockaddr_in sockAddr;
                struct sockaddr_in6 sockAddr6;
        } addr;
+        struct sockaddr_storage srcaddr; /* locally bind to this IP */
        wait_queue_head_t response_q;
        wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
        struct list_head pending_mid_q;
@@ -178,19 +197,20 @@ struct TCP_Server_Info {
        int capabilities; /* allow selective disabling of caps by smb sess */
        int timeAdj;  /* Adjust for difference in server time zone in sec */
        __u16 CurrentMid;         /* multiplex id - rotating counter */
-        char cryptKey[CIFS_CRYPTO_KEY_SIZE];
+        char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
        __u32 sequence_number; /* needed for CIFS PDU signature */
-        struct mac_key mac_signing_key;
+        struct session_key session_key;
-        char ntlmv2_hash[16];
        unsigned long lstrp; /* when we got last response from this server */
        u16 dialect; /* dialect index that server chose */
+        struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
        /* extended security flavors that server supports */
        bool    sec_kerberos;           /* supports plain Kerberos */
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
        bool    sec_ntlmssp;            /* supports NTLMSSP */
+        bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_FSCACHE
        struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
@@ -222,6 +242,8 @@ struct cifsSesInfo {
        char userName[MAX_USERNAME_SIZE + 1];
        char *domainName;
        char *password;
+        struct session_key auth_key;
+        struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
        bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
@@ -308,6 +330,45 @@ struct cifsTconInfo {
 };
 /*
+ * This is a refcounted and timestamped container for a tcon pointer. The
+ * container holds a tcon reference. It is considered safe to free one of
+ * these when the tl_count goes to 0. The tl_time is the time of the last
+ * "get" on the container.
+ */
+struct tcon_link {
+        struct rb_node          tl_rbnode;      /* rbtree linkage */
+        uid_t                   tl_uid;         /* NOTE(review): owning uid? confirm */
+        unsigned long           tl_flags;       /* presumably TCON_LINK_* bits - confirm */
+#define TCON_LINK_MASTER        0
+#define TCON_LINK_PENDING       1
+#define TCON_LINK_IN_TREE       2
+        unsigned long           tl_time;        /* time of the last "get" */
+        atomic_t                tl_count;       /* refcount; safe to free at 0 */
+        struct cifsTconInfo     *tl_tcon;       /* tcon held by this container */
+};
+extern struct tcon_link *cifs_sb_tlink(struct cifs_sb_info *cifs_sb);
+/* Return the tcon pointer stored in a tcon_link container. */
+static inline struct cifsTconInfo *tlink_tcon(struct tcon_link *tlink)
+{
+        return tlink->tl_tcon;
+}
+extern void cifs_put_tlink(struct tcon_link *tlink);
+/*
+ * Take an additional reference on a tcon_link and return it. NULL and
+ * ERR_PTR values are handed back untouched, without touching tl_count.
+ */
+static inline struct tcon_link *
+cifs_get_tlink(struct tcon_link *tlink)
+{
+        /* nothing to pin for a NULL or error pointer */
+        if (!tlink || IS_ERR(tlink))
+                return tlink;
+        atomic_inc(&tlink->tl_count);
+        return tlink;
+}
+/* This function is always expected to succeed */
+extern struct cifsTconInfo *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb);
+/*
 * This info hangs off the cifsFileInfo structure, pointed to by llist.
 * This is used to track byte stream locks on the file
 */
@@ -345,34 +406,29 @@ struct cifsFileInfo {
        __u16 netfid;           /* file id from remote */
        /* BB add lock scope info here if needed */ ;
        /* lock scope id (0 if none) */
-        struct file *pfile; /* needed for writepage */
+        struct dentry *dentry;
-        struct inode *pInode; /* needed for oplock break */
+        unsigned int f_flags;
-        struct vfsmount *mnt;
+        struct tcon_link *tlink;
        struct mutex lock_mutex;
        struct list_head llist; /* list of byte range locks we have. */
-        bool closePend:1;       /* file is marked to close */
        bool invalidHandle:1;   /* file closed via session abend */
        bool oplock_break_cancelled:1;
-        atomic_t count;         /* reference count */
+        int count;              /* refcount protected by cifs_file_list_lock */
        struct mutex fh_mutex; /* prevents reopen race after dead ses*/
        struct cifs_search_info srch_inf;
        struct work_struct oplock_break; /* work for oplock breaks */
 };
-/* Take a reference on the file private data */
+/*
+ * Take a reference on the file private data. Must be called with
+ * cifs_file_list_lock held: count is a plain int, so the increment
+ * below is not atomic on its own.
+ */
 static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 {
-        atomic_inc(&cifs_file->count);
+        ++cifs_file->count;
 }
-/* Release a reference on the file private data */
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
-static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
-{
-        if (atomic_dec_and_test(&cifs_file->count)) {
-                iput(cifs_file->pInode);
-                kfree(cifs_file);
-        }
-}
 /*
 * One of these for each file inode
@@ -382,7 +438,6 @@ struct cifsInodeInfo {
        struct list_head lockList;
        /* BB add in lists for dirty pages i.e. write caching info for oplock */
        struct list_head openFileList;
-        int write_behind_rc;
        __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
        unsigned long time;     /* jiffies of last update/check of inode */
        bool clientCanCacheRead:1;      /* read oplock */
@@ -474,16 +529,16 @@ struct oplock_q_entry {
 /* for pending dnotify requests */
 struct dir_notify_req {
-       struct list_head lhead;
+        struct list_head lhead;
-       __le16 Pid;
+        __le16 Pid;
-       __le16 PidHigh;
+        __le16 PidHigh;
-       __u16 Mid;
+        __u16 Mid;
-       __u16 Tid;
+        __u16 Tid;
-       __u16 Uid;
+        __u16 Uid;
-       __u16 netfid;
+        __u16 netfid;
-       __u32 filter; /* CompletionFilter (for multishot) */
+        __u32 filter; /* CompletionFilter (for multishot) */
-       int multishot;
+        int multishot;
-       struct file *pfile;
+        struct file *pfile;
 };
 struct dfs_info3_param {
@@ -633,7 +688,7 @@ require use of the stronger protocol */
 *  GlobalMid_Lock protects:
 *      list operations on pending_mid_q and oplockQ
 *      updates to XID counters, multiplex id  and SMB sequence numbers
- *  GlobalSMBSesLock protects:
+ *  cifs_file_list_lock protects:
 *      list operations on tcp and SMB session lists and tCon lists
 *  f_owner.lock protects certain per file struct operations
 *  mapping->page_lock protects certain per page operations
@@ -667,7 +722,7 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 * the reference counters for the server, smb session, and tcon. Finally,
 * changes to the tcon->tidStatus should be done while holding this lock.
 */
-GLOBAL_EXTERN rwlock_t          cifs_tcp_ses_lock;
+GLOBAL_EXTERN spinlock_t                cifs_tcp_ses_lock;
 /*
 * This lock protects the cifs_file->llist and cifs_file->flist
@@ -676,7 +731,7 @@ GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
 * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
 * the cifs_tcp_ses_lock must be grabbed first and released last.
 */
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
+GLOBAL_EXTERN spinlock_t        cifs_file_list_lock;
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..de36b09763a8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -131,9 +131,20 @@
 #define CIFS_CRYPTO_KEY_SIZE (8)
 /*
+ * Size of the ntlm client response
+ */
+#define CIFS_AUTH_RESP_SIZE (24)
+/*
 * Size of the session key (crypto key encrypted with the password
 */
-#define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (16)
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
 /*
 * Maximum user name length
@@ -663,7 +674,6 @@ struct ntlmv2_resp {
        __le64  time;
        __u64  client_chal; /* random */
        __u32  reserved2;
-        struct ntlmssp2_name names[2];
        /* array of name entries could follow ending in minimum 4 byte struct */
 } __attribute__((packed));
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1d60c655e3e0..7ed69b6b5fe6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,9 +78,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
                                  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 #endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
@@ -104,13 +104,14 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
                                      int offset);
+extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
-extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
+extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
-                                __u16 fileHandle, struct file *file,
+                                struct file *file, struct tcon_link *tlink,
-                                struct vfsmount *mnt, unsigned int oflags);
+                                __u32 oplock);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
                                struct super_block *sb,
-                                int mode, int oflags,
+                                int mode, unsigned int f_flags,
                                __u32 *poplock, __u16 *pnetfid, int xid);
 void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
@@ -362,13 +363,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
                          __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-                                 const struct mac_key *mac_key,
+                                 struct TCP_Server_Info *server,
                                __u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
-                                 const char *pass);
+extern int setup_ntlm_response(struct cifsSesInfo *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
-                             const struct nls_table *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct cifsSesInfo *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
                                bool encrypt, char *lnm_session_key);
@@ -408,4 +411,8 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
                        const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
+extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
+extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+                const unsigned char *path,
+                struct cifs_sb_info *cifs_sb, int xid);
 #endif                  /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7e83b356cc9e..2f2632b6df5a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -91,13 +91,13 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
        struct list_head *tmp1;
 /* list all files open on tree connection and mark them invalid */
-        write_lock(&GlobalSMBSeslock);
+        spin_lock(&cifs_file_list_lock);
        list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
                open_file = list_entry(tmp, struct cifsFileInfo, tlist);
                open_file->invalidHandle = true;
                open_file->oplock_break_cancelled = true;
        }
-        write_unlock(&GlobalSMBSeslock);
+        spin_unlock(&cifs_file_list_lock);
        /* BB Add call to invalidate_inodes(sb) for all superblocks mounted
           to this tcon */
 }
@@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                if (rsp->EncryptionKeyLength ==
                                cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-                        memcpy(server->cryptKey, rsp->EncryptionKey,
+                        memcpy(ses->server->cryptkey, rsp->EncryptionKey,
                                CIFS_CRYPTO_KEY_SIZE);
                } else if (server->secMode & SECMODE_PW_ENCRYPT) {
                        rc = -EIO; /* need cryptkey unless plain text */
@@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
        server->timeAdj *= 60;
        if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-                memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
+                memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
                       CIFS_CRYPTO_KEY_SIZE);
        } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
                        && (pSMBr->EncryptionKeyLength == 0)) {
@@ -593,9 +593,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        rc = -EIO;
                        goto neg_err_exit;
                }
-                read_lock(&cifs_tcp_ses_lock);
+                spin_lock(&cifs_tcp_ses_lock);
                if (server->srv_count > 1) {
-                        read_unlock(&cifs_tcp_ses_lock);
+                        spin_unlock(&cifs_tcp_ses_lock);
                        if (memcmp(server->server_GUID,
                                   pSMBr->u.extended_response.
                                   GUID, 16) != 0) {
@@ -605,7 +605,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                                        16);
                        }
                } else {
-                        read_unlock(&cifs_tcp_ses_lock);
+                        spin_unlock(&cifs_tcp_ses_lock);
                        memcpy(server->server_GUID,
                               pSMBr->u.extended_response.GUID, 16);
                }
@@ -620,13 +620,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                                rc = 0;
                        else
                                rc = -EINVAL;
+                        if (server->secType == Kerberos) {
-                        if (server->sec_kerberos || server->sec_mskerberos)
+                                if (!server->sec_kerberos &&
-                                server->secType = Kerberos;
+                                                !server->sec_mskerberos)
-                        else if (server->sec_ntlmssp)
+                                        rc = -EOPNOTSUPP;
-                                server->secType = RawNTLMSSP;
+                        } else if (server->secType == RawNTLMSSP) {
-                        else
+                                if (!server->sec_ntlmssp)
-                                rc = -EOPNOTSUPP;
+                                        rc = -EOPNOTSUPP;
+                        } else
+                                        rc = -EOPNOTSUPP;
                }
        } else
                server->capabilities &= ~CAP_EXTENDED_SECURITY;
diff --git a/fs/cifs/cn_cifs.h b/fs/cifs/cn_cifs.h
deleted file mode 100644
index ea59ccac2eb1..000000000000
--- a/fs/cifs/cn_cifs.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- *   fs/cifs/cn_cifs.h
- *
- *   Copyright (c) International Business Machines  Corp., 2002
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#ifndef _CN_CIFS_H
-#define _CN_CIFS_H
-#ifdef CONFIG_CIFS_UPCALL
-#include <linux/types.h>
-#include <linux/connector.h>
-struct cifs_upcall {
-        char signature[4]; /* CIFS */
-        enum command {
-                CIFS_GET_IP = 0x00000001,   /* get ip address for hostname */
-                CIFS_GET_SECBLOB = 0x00000002, /* get SPNEGO wrapped blob */
-        } command;
-        /* union cifs upcall data follows */
-};
-#endif /* CIFS_UPCALL */
-#endif /* _CN_CIFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88c84a38bccb..251a17c03545 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -47,7 +47,6 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include "rfc1002pdu.h"
-#include "cn_cifs.h"
 #include "fscache.h"
 #define CIFS_PORT 445
@@ -100,16 +99,25 @@ struct smb_vol {
        bool noautotune:1;
        bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
        bool fsc:1;     /* enable fscache */
+        bool mfsymlinks:1; /* use Minshall+French Symlinks */
+        bool multiuser:1;
        unsigned int rsize;
        unsigned int wsize;
        bool sockopt_tcp_nodelay:1;
        unsigned short int port;
        char *prepath;
+        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
        struct nls_table *local_nls;
 };
+/* FIXME: should these be tunable? */
+#define TLINK_ERROR_EXPIRE      (1 * HZ)
+#define TLINK_IDLE_EXPIRE       (600 * HZ)
 static int ipv4_connect(struct TCP_Server_Info *server);
 static int ipv6_connect(struct TCP_Server_Info *server);
+static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
+static void cifs_prune_tlinks(struct work_struct *work);
 /*
 * cifs tcp session reconnection
@@ -143,7 +151,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
        /* before reconnecting the tcp session, mark the smb session (uid)
                and the tid bad so they are not used until reconnected */
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &server->smb_ses_list) {
                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
                ses->need_reconnect = true;
@@ -153,7 +161,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
                        tcon->need_reconnect = true;
                }
        }
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        /* do not want to be sending data on a socket we are freeing */
        mutex_lock(&server->srv_mutex);
        if (server->ssocket) {
@@ -166,6 +174,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
                sock_release(server->ssocket);
                server->ssocket = NULL;
        }
+        server->sequence_number = 0;
+        server->session_estab = false;
+        kfree(server->session_key.response);
+        server->session_key.response = NULL;
+        server->session_key.len = 0;
        spin_lock(&GlobalMid_Lock);
        list_for_each(tmp, &server->pending_mid_q) {
@@ -198,7 +211,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
                        spin_lock(&GlobalMid_Lock);
                        if (server->tcpStatus != CifsExiting)
                                server->tcpStatus = CifsGood;
-                        server->sequence_number = 0;
                        spin_unlock(&GlobalMid_Lock);
        /*              atomic_set(&server->inFlight,0);*/
                        wake_up(&server->response_q);
@@ -629,9 +641,9 @@ multi_t2_fnd:
        } /* end while !EXITING */
        /* take it off the list, if it's not already */
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_del_init(&server->tcp_ses_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        spin_lock(&GlobalMid_Lock);
        server->tcpStatus = CifsExiting;
@@ -669,7 +681,7 @@ multi_t2_fnd:
         * BB: we shouldn't have to do any of this. It shouldn't be
         * possible to exit from the thread with active SMB sessions
         */
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if (list_empty(&server->pending_mid_q)) {
                /* loop through server session structures attached to this and
                    mark them dead */
@@ -679,7 +691,7 @@ multi_t2_fnd:
                        ses->status = CifsExiting;
                        ses->server = NULL;
                }
-                read_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
        } else {
                /* although we can not zero the server struct pointer yet,
                since there are active requests which may depnd on them,
@@ -702,7 +714,7 @@ multi_t2_fnd:
                        }
                }
                spin_unlock(&GlobalMid_Lock);
-                read_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                /* 1/8th of sec is more than enough time for them to exit */
                msleep(125);
        }
@@ -725,12 +737,12 @@ multi_t2_fnd:
        if a crazy root user tried to kill cifsd
        kernel thread explicitly this might happen) */
        /* BB: This shouldn't be necessary, see above */
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &server->smb_ses_list) {
                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
                ses->server = NULL;
        }
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        kfree(server->hostname);
        task_to_wake = xchg(&server->tsk, NULL);
@@ -1046,6 +1058,22 @@ cifs_parse_mount_options(char *options, const char *devname,
                                                    "long\n");
                                return 1;
                        }
+                } else if (strnicmp(data, "srcaddr", 7) == 0) {
+                        vol->srcaddr.ss_family = AF_UNSPEC;
+                        if (!value || !*value) {
+                                printk(KERN_WARNING "CIFS: srcaddr value"
+                                       " not specified.\n");
+                                return 1;       /* needs_arg; */
+                        }
+                        i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
+                                                 value, strlen(value));
+                        if (i == 0) {
+                                printk(KERN_WARNING "CIFS:  Could not parse"
+                                       " srcaddr: %s\n",
+                                       value);
+                                return 1;
+                        }
                } else if (strnicmp(data, "prefixpath", 10) == 0) {
                        if (!value || !*value) {
                                printk(KERN_WARNING
@@ -1325,6 +1353,10 @@ cifs_parse_mount_options(char *options, const char *devname,
                                "/proc/fs/cifs/LookupCacheEnabled to 0\n");
                } else if (strnicmp(data, "fsc", 3) == 0) {
                        vol->fsc = true;
+                } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
+                        vol->mfsymlinks = true;
+                } else if (strnicmp(data, "multiuser", 8) == 0) {
+                        vol->multiuser = true;
                } else
                        printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
                                                data);
@@ -1356,6 +1388,13 @@ cifs_parse_mount_options(char *options, const char *devname,
                        return 1;
                }
        }
+        if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
+                cERROR(1, "Multiuser mounts currently require krb5 "
+                          "authentication!");
+                return 1;
+        }
        if (vol->UNCip == NULL)
                vol->UNCip = &vol->UNC[2];
@@ -1374,8 +1413,36 @@ cifs_parse_mount_options(char *options, const char *devname,
        return 0;
 }
+/*
+ * Compare two source addresses.
+ *
+ * Returns true if srcaddr is unspecified (AF_UNSPEC) and rhs is
+ * unspecified too, or if srcaddr is an IPv4/IPv6 address whose address
+ * bytes equal those stored in rhs. rhs is interpreted using srcaddr's
+ * address family; its own sa_family is only examined in the AF_UNSPEC
+ * case. Any other family in srcaddr triggers a WARN_ON and returns false.
+ */
+static bool
+srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
+{
+        switch (srcaddr->sa_family) {
+        case AF_UNSPEC:
+                return (rhs->sa_family == AF_UNSPEC);
+        case AF_INET: {
+                struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
+                struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+                return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
+        }
+        case AF_INET6: {
+                struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
+                /* cast rhs itself, not the address of the local pointer */
+                struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
+                return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
+        }
+        default:
+                WARN_ON(1);
+                return false; /* don't expect to be here */
+        }
+}
 static bool
-match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
+              struct sockaddr *srcaddr)
 {
        struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
@@ -1402,6 +1469,9 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
                break;
        }
+        if (!srcip_matches(srcaddr, (struct sockaddr *)&server->srcaddr))
+                return false;
        return true;
 }
@@ -1458,29 +1528,21 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
        struct TCP_Server_Info *server;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-                /*
+                if (!match_address(server, addr,
-                 * the demux thread can exit on its own while still in CifsNew
+                                   (struct sockaddr *)&vol->srcaddr))
-                 * so don't accept any sockets in that state. Since the
-                 * tcpStatus never changes back to CifsNew it's safe to check
-                 * for this without a lock.
-                 */
-                if (server->tcpStatus == CifsNew)
-                        continue;
-                if (!match_address(server, addr))
                        continue;
                if (!match_security(server, vol))
                        continue;
                ++server->srv_count;
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                cFYI(1, "Existing tcp session with server found");
                return server;
        }
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        return NULL;
 }
@@ -1489,21 +1551,26 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
        struct task_struct *task;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if (--server->srv_count > 0) {
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return;
        }
        list_del_init(&server->tcp_ses_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        spin_lock(&GlobalMid_Lock);
        server->tcpStatus = CifsExiting;
        spin_unlock(&GlobalMid_Lock);
+        cifs_crypto_shash_release(server);
        cifs_fscache_release_client_cookie(server);
+        kfree(server->session_key.response);
+        server->session_key.response = NULL;
+        server->session_key.len = 0;
        task = xchg(&server->tsk, NULL);
        if (task)
                force_sig(SIGKILL, task);
@@ -1556,10 +1623,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                goto out_err;
        }
+        rc = cifs_crypto_shash_allocate(tcp_ses);
+        if (rc) {
+                cERROR(1, "could not setup hash structures rc %d", rc);
+                goto out_err;
+        }
        tcp_ses->hostname = extract_hostname(volume_info->UNC);
        if (IS_ERR(tcp_ses->hostname)) {
                rc = PTR_ERR(tcp_ses->hostname);
-                goto out_err;
+                goto out_err_crypto_release;
        }
        tcp_ses->noblocksnd = volume_info->noblocksnd;
@@ -1574,6 +1647,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
        memcpy(tcp_ses->server_RFC1001_name,
                volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+        tcp_ses->session_estab = false;
        tcp_ses->sequence_number = 0;
        INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
        INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
@@ -1584,6 +1658,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
         * no need to spinlock this init of tcpStatus or srv_count
         */
        tcp_ses->tcpStatus = CifsNew;
+        memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+               sizeof(tcp_ses->srcaddr));
        ++tcp_ses->srv_count;
        if (addr.ss_family == AF_INET6) {
@@ -1600,7 +1676,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        }
        if (rc < 0) {
                cERROR(1, "Error connecting to socket. Aborting operation");
-                goto out_err;
+                goto out_err_crypto_release;
        }
        /*
@@ -1614,18 +1690,21 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                rc = PTR_ERR(tcp_ses->tsk);
                cERROR(1, "error %d create cifsd thread", rc);
                module_put(THIS_MODULE);
-                goto out_err;
+                goto out_err_crypto_release;
        }
        /* thread spawned, put it on the list */
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        cifs_fscache_get_client_cookie(tcp_ses);
        return tcp_ses;
+out_err_crypto_release:
+        cifs_crypto_shash_release(tcp_ses);
 out_err:
        if (tcp_ses) {
                if (!IS_ERR(tcp_ses->hostname))
@@ -1642,7 +1721,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
        struct cifsSesInfo *ses;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
                switch (server->secType) {
                case Kerberos:
@@ -1662,10 +1741,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
                                continue;
                }
                ++ses->ses_count;
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return ses;
        }
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        return NULL;
 }
@@ -1676,14 +1755,14 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
        struct TCP_Server_Info *server = ses->server;
        cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if (--ses->ses_count > 0) {
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return;
        }
        list_del_init(&ses->smb_ses_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        if (ses->status == CifsGood) {
                xid = GetXid();
@@ -1760,10 +1839,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                        goto get_ses_fail;
        }
        if (volume_info->domainname) {
-                int len = strlen(volume_info->domainname);
+                ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL);
-                ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+                if (!ses->domainName)
-                if (ses->domainName)
+                        goto get_ses_fail;
-                        strcpy(ses->domainName, volume_info->domainname);
        }
        ses->cred_uid = volume_info->cred_uid;
        ses->linux_uid = volume_info->linux_uid;
@@ -1778,9 +1856,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                goto get_ses_fail;
        /* success, put it on the list */
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_add(&ses->smb_ses_list, &server->smb_ses_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        FreeXid(xid);
        return ses;
@@ -1797,7 +1875,7 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
        struct list_head *tmp;
        struct cifsTconInfo *tcon;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &ses->tcon_list) {
                tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
                if (tcon->tidStatus == CifsExiting)
@@ -1806,10 +1884,10 @@ cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
                        continue;
                ++tcon->tc_count;
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return tcon;
        }
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        return NULL;
 }
@@ -1820,14 +1898,14 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
        struct cifsSesInfo *ses = tcon->ses;
        cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if (--tcon->tc_count > 0) {
-                write_unlock(&cifs_tcp_ses_lock);
+                spin_unlock(&cifs_tcp_ses_lock);
                return;
        }
        list_del_init(&tcon->tcon_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        xid = GetXid();
        CIFSSMBTDis(xid, tcon);
@@ -1900,9 +1978,9 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
        tcon->nocase = volume_info->nocase;
        tcon->local_lease = volume_info->local_lease;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_add(&tcon->tcon_list, &ses->tcon_list);
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        cifs_fscache_get_super_cookie(tcon);
@@ -1913,6 +1991,23 @@ out_fail:
        return ERR_PTR(rc);
 }
+/*
+ * Drop one reference on a tcon_link. NULL and ERR_PTR values are ignored.
+ *
+ * The link is only torn down when this was the last reference AND the link
+ * is no longer in the superblock's tree; in every other case the idle
+ * timestamp is refreshed and the link is left alone.
+ */
+void
+cifs_put_tlink(struct tcon_link *tlink)
+{
+        if (tlink == NULL || IS_ERR(tlink))
+                return;
+        if (atomic_dec_and_test(&tlink->tl_count) &&
+            !test_bit(TCON_LINK_IN_TREE, &tlink->tl_flags)) {
+                /* last user of an unlinked entry: release tcon and memory */
+                if (!IS_ERR(tlink_tcon(tlink)))
+                        cifs_put_tcon(tlink_tcon(tlink));
+                kfree(tlink);
+                return;
+        }
+        /* still referenced or still in the tree: just note the last use */
+        tlink->tl_time = jiffies;
+}
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
@@ -1997,6 +2092,33 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 }
+/*
+ * Bind the server's socket to the local source address stored in
+ * server->srcaddr. Does nothing and returns 0 when no source address was
+ * given (AF_UNSPEC). On bind failure the address and error are logged and
+ * the negative return code from the bind operation is passed back.
+ */
+static int
+bind_socket(struct TCP_Server_Info *server)
+{
+        struct socket *socket;
+        struct sockaddr_in *saddr4;
+        struct sockaddr_in6 *saddr6;
+        int rc;
+        if (server->srcaddr.ss_family == AF_UNSPEC)
+                return 0;
+        /* Bind to the specified local IP address */
+        socket = server->ssocket;
+        rc = socket->ops->bind(socket,
+                               (struct sockaddr *) &server->srcaddr,
+                               sizeof(server->srcaddr));
+        if (rc >= 0)
+                return rc;
+        /* report the address in the notation matching its family */
+        saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
+        if (saddr6->sin6_family == AF_INET6) {
+                cERROR(1, "cifs: "
+                       "Failed to bind to: %pI6c, error: %d\n",
+                       &saddr6->sin6_addr, rc);
+        } else {
+                saddr4 = (struct sockaddr_in *)&server->srcaddr;
+                cERROR(1, "cifs: "
+                       "Failed to bind to: %pI4, error: %d\n",
+                       &saddr4->sin_addr.s_addr, rc);
+        }
+        return rc;
+}
 static int
 ipv4_connect(struct TCP_Server_Info *server)
@@ -2022,6 +2144,10 @@ ipv4_connect(struct TCP_Server_Info *server)
                cifs_reclassify_socket4(socket);
        }
+        rc = bind_socket(server);
+        if (rc < 0)
+                return rc;
        /* user overrode default port */
        if (server->addr.sockAddr.sin_port) {
                rc = socket->ops->connect(socket, (struct sockaddr *)
@@ -2184,6 +2310,10 @@ ipv6_connect(struct TCP_Server_Info *server)
                cifs_reclassify_socket6(socket);
        }
+        rc = bind_socket(server);
+        if (rc < 0)
+                return rc;
        /* user overrode default port */
        if (server->addr.sockAddr6.sin6_port) {
                rc = socket->ops->connect(socket,
@@ -2383,6 +2513,8 @@ convert_delimiter(char *path, char delim)
 static void setup_cifs_sb(struct smb_vol *pvolume_info,
                          struct cifs_sb_info *cifs_sb)
 {
+        INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
        if (pvolume_info->rsize > CIFSMaxBufSize) {
                cERROR(1, "rsize %d too large, using MaxBufSize",
                        pvolume_info->rsize);
@@ -2462,10 +2594,21 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
        if (pvolume_info->fsc)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
+        if (pvolume_info->multiuser)
+                cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
+                                            CIFS_MOUNT_NO_PERM);
        if (pvolume_info->direct_io) {
                cFYI(1, "mounting share using direct i/o");
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
        }
+        if (pvolume_info->mfsymlinks) {
+                if (pvolume_info->sfu_emul) {
+                        cERROR(1,  "mount option mfsymlinks ignored if sfu "
+                                   "mount option is used");
+                } else {
+                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
+                }
+        }
        if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
                cERROR(1, "mount option dynperm ignored if cifsacl "
@@ -2552,6 +2695,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        struct TCP_Server_Info *srvTcp;
        char   *full_path;
        char *mount_data = mount_data_global;
+        struct tcon_link *tlink;
 #ifdef CONFIG_CIFS_DFS_UPCALL
        struct dfs_info3_param *referrals = NULL;
        unsigned int num_referrals = 0;
@@ -2563,6 +2707,7 @@ try_mount_again:
        pSesInfo = NULL;
        srvTcp = NULL;
        full_path = NULL;
+        tlink = NULL;
        xid = GetXid();
@@ -2638,8 +2783,6 @@ try_mount_again:
                goto remote_path_check;
        }
-        cifs_sb->tcon = tcon;
        /* do not care if following two calls succeed - informational */
        if (!tcon->ipc) {
                CIFSSMBQFSDeviceInfo(xid, tcon);
@@ -2748,6 +2891,30 @@ remote_path_check:
 #endif
        }
+        if (rc)
+                goto mount_fail_check;
+        /* now, hang the tcon off of the superblock */
+        tlink = kzalloc(sizeof *tlink, GFP_KERNEL);
+        if (tlink == NULL) {
+                rc = -ENOMEM;
+                goto mount_fail_check;
+        }
+        tlink->tl_uid = pSesInfo->linux_uid;
+        tlink->tl_tcon = tcon;
+        tlink->tl_time = jiffies;
+        set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
+        set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+        cifs_sb->master_tlink = tlink;
+        spin_lock(&cifs_sb->tlink_tree_lock);
+        tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+        spin_unlock(&cifs_sb->tlink_tree_lock);
+        queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+                                TLINK_IDLE_EXPIRE);
 mount_fail_check:
        /* on error free sesinfo and tcon struct if needed */
        if (rc) {
@@ -2825,14 +2992,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
                if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
                    (ses->server->secType == LANMAN))
-                        calc_lanman_hash(tcon->password, ses->server->cryptKey,
+                        calc_lanman_hash(tcon->password, ses->server->cryptkey,
                                         ses->server->secMode &
                                            SECMODE_PW_ENCRYPT ? true : false,
                                         bcc_ptr);
                else
 #endif /* CIFS_WEAK_PW_HASH */
-                SMBNTencrypt(tcon->password, ses->server->cryptKey,
+                SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
-                             bcc_ptr);
                bcc_ptr += CIFS_SESS_KEY_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
@@ -2934,19 +3100,32 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
-        int rc = 0;
+        struct rb_root *root = &cifs_sb->tlink_tree;
+        struct rb_node *node;
+        struct tcon_link *tlink;
        char *tmp;
-        if (cifs_sb->tcon)
+        cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
-                cifs_put_tcon(cifs_sb->tcon);
+        spin_lock(&cifs_sb->tlink_tree_lock);
+        while ((node = rb_first(root))) {
+                tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+                cifs_get_tlink(tlink);
+                clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+                rb_erase(node, root);
+                spin_unlock(&cifs_sb->tlink_tree_lock);
+                cifs_put_tlink(tlink);
+                spin_lock(&cifs_sb->tlink_tree_lock);
+        }
+        spin_unlock(&cifs_sb->tlink_tree_lock);
-        cifs_sb->tcon = NULL;
        tmp = cifs_sb->prepath;
        cifs_sb->prepathlen = 0;
        cifs_sb->prepath = NULL;
        kfree(tmp);
-        return rc;
+        return 0;
 }
 int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
@@ -2997,6 +3176,16 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
        if (rc) {
                cERROR(1, "Send error in SessSetup = %d", rc);
        } else {
+                mutex_lock(&ses->server->srv_mutex);
+                if (!server->session_estab) {
+                        server->session_key.response = ses->auth_key.response;
+                        server->session_key.len = ses->auth_key.len;
+                        server->sequence_number = 0x2;
+                        server->session_estab = true;
+                        ses->auth_key.response = NULL;
+                }
+                mutex_unlock(&server->srv_mutex);
                cFYI(1, "CIFS Session Established successfully");
                spin_lock(&GlobalMid_Lock);
                ses->status = CifsGood;
@@ -3004,6 +3193,263 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
                spin_unlock(&GlobalMid_Lock);
        }
+        kfree(ses->auth_key.response);
+        ses->auth_key.response = NULL;
+        ses->auth_key.len = 0;
+        kfree(ses->ntlmssp);
+        ses->ntlmssp = NULL;
        return rc;
 }
+/*
+ * Build a tcon for the given fsuid on a multiuser mount.
+ *
+ * A temporary smb_vol is filled in from the superblock and its master tcon
+ * (UNC, retry, nocase, local_lease, unix extension setting), forced to
+ * krb5 security, and then used to get an SMB session and a tcon on the
+ * same TCP session as the master tcon.
+ *
+ * Returns the tcon on success or an ERR_PTR on failure. The temporary
+ * smb_vol is always freed before returning.
+ */
+static struct cifsTconInfo *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+{
+        struct cifsTconInfo *master_tcon = cifs_sb_master_tcon(cifs_sb);
+        struct cifsSesInfo *ses;
+        struct cifsTconInfo *tcon = NULL;
+        struct smb_vol *vol_info;
+        char username[MAX_USERNAME_SIZE + 1];
+        vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
+        if (vol_info == NULL) {
+                tcon = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        /*
+         * Placeholder username derived from the fsuid. It lives in the
+         * on-stack buffer above, so only vol_info itself is freed at "out".
+         */
+        snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid);
+        vol_info->username = username;
+        vol_info->local_nls = cifs_sb->local_nls;
+        vol_info->linux_uid = fsuid;
+        vol_info->cred_uid = fsuid;
+        vol_info->UNC = master_tcon->treeName;
+        vol_info->retry = master_tcon->retry;
+        vol_info->nocase = master_tcon->nocase;
+        vol_info->local_lease = master_tcon->local_lease;
+        vol_info->no_linux_ext = !master_tcon->unix_ext;
+        /* FIXME: allow for other secFlg settings */
+        vol_info->secFlg = CIFSSEC_MUST_KRB5;
+        /* get a reference for the same TCP session */
+        spin_lock(&cifs_tcp_ses_lock);
+        ++master_tcon->ses->server->srv_count;
+        spin_unlock(&cifs_tcp_ses_lock);
+        ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
+        if (IS_ERR(ses)) {
+                /* pass the session error back as the tcon result */
+                tcon = (struct cifsTconInfo *)ses;
+                /* no session took over the reference taken above; drop it */
+                cifs_put_tcp_session(master_tcon->ses->server);
+                goto out;
+        }
+        tcon = cifs_get_tcon(ses, vol_info);
+        if (IS_ERR(tcon)) {
+                /*
+                 * NOTE(review): presumably putting the session also releases
+                 * the TCP session reference it holds - confirm against
+                 * cifs_put_smb_ses().
+                 */
+                cifs_put_smb_ses(ses);
+                goto out;
+        }
+        if (ses->capabilities & CAP_UNIX)
+                reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+out:
+        kfree(vol_info);
+        return tcon;
+}
+/* Return the master tcon_link stored in the superblock info. */
+static inline struct tcon_link *
+cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
+{
+        struct tcon_link *master = cifs_sb->master_tlink;
+        return master;
+}
+/* Return the tcon that hangs off the superblock's master tcon_link. */
+struct cifsTconInfo *
+cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
+{
+        struct tcon_link *master = cifs_sb_master_tlink(cifs_sb);
+        return tlink_tcon(master);
+}
+/*
+ * Wait action passed to wait_on_bit() while a tcon_link is pending:
+ * yield the CPU, then report -ERESTARTSYS if a signal is pending for the
+ * current task, 0 otherwise. The argument is not used.
+ */
+static int
+cifs_sb_tcon_pending_wait(void *unused)
+{
+        schedule();
+        if (signal_pending(current))
+                return -ERESTARTSYS;
+        return 0;
+}
+/*
+ * Look up the tcon_link whose tl_uid equals uid in the rbtree rooted at
+ * root. Returns the matching entry, or NULL if there is none.
+ */
+static struct tcon_link *
+tlink_rb_search(struct rb_root *root, uid_t uid)
+{
+        struct rb_node *cur;
+        struct tcon_link *entry;
+        for (cur = root->rb_node; cur != NULL; ) {
+                entry = rb_entry(cur, struct tcon_link, tl_rbnode);
+                if (entry->tl_uid == uid)
+                        return entry;
+                /* larger keys send us left, smaller keys send us right */
+                cur = (entry->tl_uid > uid) ? cur->rb_left : cur->rb_right;
+        }
+        return NULL;
+}
+/*
+ * Link new_tlink into the rbtree rooted at root, ordered by tl_uid.
+ * An entry whose tl_uid is not smaller than an existing node's goes to
+ * that node's right; no check for an existing equal key is made here.
+ */
+static void
+tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct tcon_link *cur;
+        /* walk down to the empty slot where the new node belongs */
+        while (*link != NULL) {
+                parent = *link;
+                cur = rb_entry(parent, struct tcon_link, tl_rbnode);
+                if (cur->tl_uid > new_tlink->tl_uid)
+                        link = &parent->rb_left;
+                else
+                        link = &parent->rb_right;
+        }
+        rb_link_node(&new_tlink->tl_rbnode, parent, link);
+        rb_insert_color(&new_tlink->tl_rbnode, root);
+}
+/*
+ * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
+ * current task.
+ *
+ * If the superblock doesn't refer to a multiuser mount, then just return
+ * the master tcon for the mount.
+ *
+ * First, search the rbtree for an existing tcon for this fsuid. If one
+ * exists, then check to see if it's pending construction. If it is then wait
+ * for construction to complete. Once it's no longer pending, check to see if
+ * it failed and either return an error or retry construction, depending on
+ * the timeout.
+ *
+ * If one doesn't exist then insert a new tcon_link struct into the tree and
+ * try to construct a new one.
+ *
+ * Returns a tcon_link on which cifs_get_tlink() has been called, or an
+ * ERR_PTR: -ENOMEM if the link can't be allocated, the wait_on_bit() result
+ * if the wait is interrupted, and -EACCES if construction failed (whatever
+ * error cifs_construct_tcon() reported) or failed too recently to retry.
+ */
+struct tcon_link *
+cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
+{
+        int ret;
+        uid_t fsuid = current_fsuid();
+        struct tcon_link *tlink, *newtlink;
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+                return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
+        spin_lock(&cifs_sb->tlink_tree_lock);
+        tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
+        if (tlink)
+                cifs_get_tlink(tlink);
+        spin_unlock(&cifs_sb->tlink_tree_lock);
+        if (tlink == NULL) {
+                /*
+                 * Allocate outside the spinlock. sizeof(*tlink) is only a
+                 * type-size expression; tlink (NULL here) is not dereferenced.
+                 */
+                newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
+                if (newtlink == NULL)
+                        return ERR_PTR(-ENOMEM);
+                newtlink->tl_uid = fsuid;
+                /* error placeholder until construction below succeeds */
+                newtlink->tl_tcon = ERR_PTR(-EACCES);
+                set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
+                set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
+                cifs_get_tlink(newtlink);
+                spin_lock(&cifs_sb->tlink_tree_lock);
+                /* was one inserted after previous search? */
+                tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
+                if (tlink) {
+                        /* lost the race: discard ours and use the winner's */
+                        cifs_get_tlink(tlink);
+                        spin_unlock(&cifs_sb->tlink_tree_lock);
+                        kfree(newtlink);
+                        goto wait_for_construction;
+                }
+                tlink = newtlink;
+                tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
+                spin_unlock(&cifs_sb->tlink_tree_lock);
+        } else {
+wait_for_construction:
+                ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
+                                  cifs_sb_tcon_pending_wait,
+                                  TASK_INTERRUPTIBLE);
+                if (ret) {
+                        cifs_put_tlink(tlink);
+                        return ERR_PTR(ret);
+                }
+                /* if it's good, return it */
+                if (!IS_ERR(tlink->tl_tcon))
+                        return tlink;
+                /* return error if we tried this already recently */
+                if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
+                        cifs_put_tlink(tlink);
+                        return ERR_PTR(-EACCES);
+                }
+                /*
+                 * Claim the retry by setting the pending bit; if another
+                 * task set it first, go back and wait for its attempt.
+                 */
+                if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
+                        goto wait_for_construction;
+        }
+        /* reached only by the task that owns the pending bit for this link */
+        tlink->tl_tcon = cifs_construct_tcon(cifs_sb, fsuid);
+        /* publish the result, then wake anyone waiting on the pending bit */
+        clear_bit(TCON_LINK_PENDING, &tlink->tl_flags);
+        wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
+        if (IS_ERR(tlink->tl_tcon)) {
+                cifs_put_tlink(tlink);
+                return ERR_PTR(-EACCES);
+        }
+        return tlink;
+}
+/*
+ * periodic workqueue job that scans tcon_tree for a superblock and closes
+ * out tcons.
+ *
+ * An entry is pruned when it is not the master link, its tl_count reads
+ * zero, and TLINK_IDLE_EXPIRE has passed since its tl_time stamp. The job
+ * re-queues itself to run again after TLINK_IDLE_EXPIRE.
+ */
+static void
+cifs_prune_tlinks(struct work_struct *work)
+{
+        struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
+                                                    prune_tlinks.work);
+        struct rb_root *root = &cifs_sb->tlink_tree;
+        /* not initialised here: the tree may only be read under the lock */
+        struct rb_node *node;
+        struct rb_node *tmp;
+        struct tcon_link *tlink;
+        /*
+         * Because we drop the spinlock in the loop in order to put the tlink
+         * it's not guarded against removal of links from the tree. The only
+         * places that remove entries from the tree are this function and
+         * umounts. Because this function is non-reentrant and is canceled
+         * before umount can proceed, this is safe.
+         */
+        spin_lock(&cifs_sb->tlink_tree_lock);
+        node = rb_first(root);
+        while (node != NULL) {
+                /* fetch the successor first; tmp may be erased below */
+                tmp = node;
+                node = rb_next(tmp);
+                tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
+                if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
+                    atomic_read(&tlink->tl_count) != 0 ||
+                    time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
+                        continue;
+                /*
+                 * Take a reference and unlink the entry while still locked;
+                 * the put below, done without the spinlock held, then
+                 * releases the tcon and frees the link if that reference
+                 * was the last one.
+                 */
+                cifs_get_tlink(tlink);
+                clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
+                rb_erase(tmp, root);
+                spin_unlock(&cifs_sb->tlink_tree_lock);
+                cifs_put_tlink(tlink);
+                spin_lock(&cifs_sb->tlink_tree_lock);
+        }
+        spin_unlock(&cifs_sb->tlink_tree_lock);
+        queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
+                                TLINK_IDLE_EXPIRE);
+}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f9ed0751cc12..3840eddbfb7a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -54,18 +54,18 @@ build_path_from_dentry(struct dentry *direntry)
        int dfsplen;
        char *full_path;
        char dirsep;
-        struct cifs_sb_info *cifs_sb;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        if (direntry == NULL)
                return NULL;  /* not much we can do if dentry is freed and
                we need to reopen the file after it was closed implicitly
                when the server crashed */
-        cifs_sb = CIFS_SB(direntry->d_sb);
        dirsep = CIFS_DIR_SEP(cifs_sb);
        pplen = cifs_sb->prepathlen;
-        if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
+        if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
-                dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+                dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
        else
                dfsplen = 0;
 cifs_bp_rename_retry:
@@ -117,7 +117,7 @@ cifs_bp_rename_retry:
        /* BB test paths to Windows with '/' in the midst of prepath */
        if (dfsplen) {
-                strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+                strncpy(full_path, tcon->treeName, dfsplen);
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
                        int i;
                        for (i = 0; i < dfsplen; i++) {
@@ -130,135 +130,6 @@ cifs_bp_rename_retry:
        return full_path;
 }
-struct cifsFileInfo *
-cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
-                  struct file *file, struct vfsmount *mnt, unsigned int oflags)
-{
-        int oplock = 0;
-        struct cifsFileInfo *pCifsFile;
-        struct cifsInodeInfo *pCifsInode;
-        struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
-        pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-        if (pCifsFile == NULL)
-                return pCifsFile;
-        if (oplockEnabled)
-                oplock = REQ_OPLOCK;
-        pCifsFile->netfid = fileHandle;
-        pCifsFile->pid = current->tgid;
-        pCifsFile->pInode = igrab(newinode);
-        pCifsFile->mnt = mnt;
-        pCifsFile->pfile = file;
-        pCifsFile->invalidHandle = false;
-        pCifsFile->closePend = false;
-        mutex_init(&pCifsFile->fh_mutex);
-        mutex_init(&pCifsFile->lock_mutex);
-        INIT_LIST_HEAD(&pCifsFile->llist);
-        atomic_set(&pCifsFile->count, 1);
-        INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
-        write_lock(&GlobalSMBSeslock);
-        list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
-        pCifsInode = CIFS_I(newinode);
-        if (pCifsInode) {
-                /* if readable file instance put first in list*/
-                if (oflags & FMODE_READ)
-                        list_add(&pCifsFile->flist, &pCifsInode->openFileList);
-                else
-                        list_add_tail(&pCifsFile->flist,
-                                      &pCifsInode->openFileList);
-                if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                        pCifsInode->clientCanCacheAll = true;
-                        pCifsInode->clientCanCacheRead = true;
-                        cFYI(1, "Exclusive Oplock inode %p", newinode);
-                } else if ((oplock & 0xF) == OPLOCK_READ)
-                                pCifsInode->clientCanCacheRead = true;
-        }
-        write_unlock(&GlobalSMBSeslock);
-        file->private_data = pCifsFile;
-        return pCifsFile;
-}
-int cifs_posix_open(char *full_path, struct inode **pinode,
-                        struct super_block *sb, int mode, int oflags,
-                        __u32 *poplock, __u16 *pnetfid, int xid)
-{
-        int rc;
-        FILE_UNIX_BASIC_INFO *presp_data;
-        __u32 posix_flags = 0;
-        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifs_fattr fattr;
-        cFYI(1, "posix open %s", full_path);
-        presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-        if (presp_data == NULL)
-                return -ENOMEM;
-/* So far cifs posix extensions can only map the following flags.
-   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
-   so far we do not seem to need them, and we can treat them as local only */
-        if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
-                (FMODE_READ | FMODE_WRITE))
-                posix_flags = SMB_O_RDWR;
-        else if (oflags & FMODE_READ)
-                posix_flags = SMB_O_RDONLY;
-        else if (oflags & FMODE_WRITE)
-                posix_flags = SMB_O_WRONLY;
-        if (oflags & O_CREAT)
-                posix_flags |= SMB_O_CREAT;
-        if (oflags & O_EXCL)
-                posix_flags |= SMB_O_EXCL;
-        if (oflags & O_TRUNC)
-                posix_flags |= SMB_O_TRUNC;
-        /* be safe and imply O_SYNC for O_DSYNC */
-        if (oflags & O_DSYNC)
-                posix_flags |= SMB_O_SYNC;
-        if (oflags & O_DIRECTORY)
-                posix_flags |= SMB_O_DIRECTORY;
-        if (oflags & O_NOFOLLOW)
-                posix_flags |= SMB_O_NOFOLLOW;
-        if (oflags & O_DIRECT)
-                posix_flags |= SMB_O_DIRECT;
-        mode &= ~current_umask();
-        rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
-                        pnetfid, presp_data, poplock, full_path,
-                        cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-        if (rc)
-                goto posix_open_ret;
-        if (presp_data->Type == cpu_to_le32(-1))
-                goto posix_open_ret; /* open ok, caller does qpathinfo */
-        if (!pinode)
-                goto posix_open_ret; /* caller does not need info */
-        cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
-        /* get new inode and set it up */
-        if (*pinode == NULL) {
-                cifs_fill_uniqueid(sb, &fattr);
-                *pinode = cifs_iget(sb, &fattr);
-                if (!*pinode) {
-                        rc = -ENOMEM;
-                        goto posix_open_ret;
-                }
-        } else {
-                cifs_fattr_to_inode(*pinode, &fattr);
-        }
-posix_open_ret:
-        kfree(presp_data);
-        return rc;
-}
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
                              struct dentry *direntry,
                              struct inode *newinode)
@@ -291,6 +162,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        int desiredAccess = GENERIC_READ | GENERIC_WRITE;
        __u16 fileHandle;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *tcon;
        char *full_path = NULL;
        FILE_ALL_INFO *buf = NULL;
@@ -300,21 +172,26 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        xid = GetXid();
        cifs_sb = CIFS_SB(inode->i_sb);
-        tcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
-        full_path = build_path_from_dentry(direntry);
+                FreeXid(xid);
-        if (full_path == NULL) {
+                return PTR_ERR(tlink);
-                rc = -ENOMEM;
-                goto cifs_create_out;
        }
+        tcon = tlink_tcon(tlink);
        if (oplockEnabled)
                oplock = REQ_OPLOCK;
        if (nd && (nd->flags & LOOKUP_OPEN))
-                oflags = nd->intent.open.flags;
+                oflags = nd->intent.open.file->f_flags;
        else
-                oflags = FMODE_READ | SMB_O_CREAT;
+                oflags = O_RDONLY | O_CREAT;
+        full_path = build_path_from_dentry(direntry);
+        if (full_path == NULL) {
+                rc = -ENOMEM;
+                goto cifs_create_out;
+        }
        if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
            (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -344,9 +221,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                /* if the file is going to stay open, then we
                   need to set the desired access properly */
                desiredAccess = 0;
-                if (oflags & FMODE_READ)
+                if (OPEN_FMODE(oflags) & FMODE_READ)
                        desiredAccess |= GENERIC_READ; /* is this too little? */
-                if (oflags & FMODE_WRITE)
+                if (OPEN_FMODE(oflags) & FMODE_WRITE)
                        desiredAccess |= GENERIC_WRITE;
                if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -375,7 +252,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
                create_options |= CREATE_OPTION_READONLY;
-        if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+        if (tcon->ses->capabilities & CAP_NT_SMBS)
                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
                         desiredAccess, create_options,
                         &fileHandle, &oplock, buf, cifs_sb->local_nls,
@@ -467,8 +344,7 @@ cifs_create_set_dentry:
                        goto cifs_create_out;
                }
-                pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
+                pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock);
-                                               nd->path.mnt, oflags);
                if (pfile_info == NULL) {
                        fput(filp);
                        CIFSSMBClose(xid, tcon, fileHandle);
@@ -481,6 +357,7 @@ cifs_create_set_dentry:
 cifs_create_out:
        kfree(buf);
        kfree(full_path);
+        cifs_put_tlink(tlink);
        FreeXid(xid);
        return rc;
 }
@@ -491,6 +368,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        int rc = -EPERM;
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
@@ -503,10 +381,14 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
        if (!old_valid_dev(device_number))
                return -EINVAL;
-        xid = GetXid();
        cifs_sb = CIFS_SB(inode->i_sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
@@ -606,6 +488,7 @@ mknod_out:
        kfree(full_path);
        kfree(buf);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -619,6 +502,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        __u16 fileHandle = 0;
        bool posix_open = false;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct cifsFileInfo *cfile;
        struct inode *newInode = NULL;
@@ -633,7 +517,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        /* check whether path exists */
        cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                FreeXid(xid);
+                return (struct dentry *)tlink;
+        }
+        pTcon = tlink_tcon(tlink);
        /*
         * Don't allow the separator character in a path component.
@@ -644,8 +533,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                for (i = 0; i < direntry->d_name.len; i++)
                        if (direntry->d_name.name[i] == '\\') {
                                cFYI(1, "Invalid file name");
-                                FreeXid(xid);
+                                rc = -EINVAL;
-                                return ERR_PTR(-EINVAL);
+                                goto lookup_out;
                        }
        }
@@ -655,7 +544,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
         */
        if (nd && (nd->flags & LOOKUP_EXCL)) {
                d_instantiate(direntry, NULL);
-                return NULL;
+                rc = 0;
+                goto lookup_out;
        }
        /* can not grab the rename sem here since it would
@@ -663,8 +553,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        in which we already have the sb rename sem */
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
-                FreeXid(xid);
+                rc = -ENOMEM;
-                return ERR_PTR(-ENOMEM);
+                goto lookup_out;
        }
        if (direntry->d_inode != NULL) {
@@ -687,11 +577,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
        if (pTcon->unix_ext) {
                if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
                     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
-                     (nd->intent.open.flags & O_CREAT)) {
+                     (nd->intent.open.file->f_flags & O_CREAT)) {
                        rc = cifs_posix_open(full_path, &newInode,
                                        parent_dir_inode->i_sb,
                                        nd->intent.open.create_mode,
-                                        nd->intent.open.flags, &oplock,
+                                        nd->intent.open.file->f_flags, &oplock,
                                        &fileHandle, xid);
                        /*
                         * The check below works around a bug in POSIX
@@ -727,9 +617,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                                goto lookup_out;
                        }
-                        cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
+                        cfile = cifs_new_fileinfo(fileHandle, filp, tlink,
-                                                  nd->path.mnt,
+                                                  oplock);
-                                                  nd->intent.open.flags);
                        if (cfile == NULL) {
                                fput(filp);
                                CIFSSMBClose(xid, pTcon, fileHandle);
@@ -759,6 +648,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 lookup_out:
        kfree(full_path);
+        cifs_put_tlink(tlink);
        FreeXid(xid);
        return ERR_PTR(rc);
 }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index de748c652d11..06c3e83fa387 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -60,34 +60,32 @@ static inline int cifs_convert_flags(unsigned int flags)
                FILE_READ_DATA);
 }
-static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+static u32 cifs_posix_convert_flags(unsigned int flags)
 {
-        fmode_t posix_flags = 0;
+        u32 posix_flags = 0;
        if ((flags & O_ACCMODE) == O_RDONLY)
-                posix_flags = FMODE_READ;
+                posix_flags = SMB_O_RDONLY;
        else if ((flags & O_ACCMODE) == O_WRONLY)
-                posix_flags = FMODE_WRITE;
+                posix_flags = SMB_O_WRONLY;
-        else if ((flags & O_ACCMODE) == O_RDWR) {
+        else if ((flags & O_ACCMODE) == O_RDWR)
-                /* GENERIC_ALL is too much permission to request
+                posix_flags = SMB_O_RDWR;
-                   can cause unnecessary access denied on create */
-                /* return GENERIC_ALL; */
+        if (flags & O_CREAT)
-                posix_flags = FMODE_READ | FMODE_WRITE;
+                posix_flags |= SMB_O_CREAT;
-        }
+        if (flags & O_EXCL)
-        /* can not map O_CREAT or O_EXCL or O_TRUNC flags when
+                posix_flags |= SMB_O_EXCL;
-           reopening a file.  They had their effect on the original open */
+        if (flags & O_TRUNC)
-        if (flags & O_APPEND)
+                posix_flags |= SMB_O_TRUNC;
-                posix_flags |= (fmode_t)O_APPEND;
+        /* be safe and imply O_SYNC for O_DSYNC */
        if (flags & O_DSYNC)
-                posix_flags |= (fmode_t)O_DSYNC;
+                posix_flags |= SMB_O_SYNC;
-        if (flags & __O_SYNC)
-                posix_flags |= (fmode_t)__O_SYNC;
        if (flags & O_DIRECTORY)
-                posix_flags |= (fmode_t)O_DIRECTORY;
+                posix_flags |= SMB_O_DIRECTORY;
        if (flags & O_NOFOLLOW)
-                posix_flags |= (fmode_t)O_NOFOLLOW;
+                posix_flags |= SMB_O_NOFOLLOW;
        if (flags & O_DIRECT)
-                posix_flags |= (fmode_t)O_DIRECT;
+                posix_flags |= SMB_O_DIRECT;
        return posix_flags;
 }
@@ -106,66 +104,8 @@ static inline int cifs_get_disposition(unsigned int flags)
                return FILE_OPEN;
 }
-/* all arguments to this function must be checked for validity in caller */
-static inline int
-cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
-                             struct cifsInodeInfo *pCifsInode, __u32 oplock,
-                             u16 netfid)
-{
-        write_lock(&GlobalSMBSeslock);
-        pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-        if (pCifsInode == NULL) {
-                write_unlock(&GlobalSMBSeslock);
-                return -EINVAL;
-        }
-        if (pCifsInode->clientCanCacheRead) {
-                /* we have the inode open somewhere else
-                   no need to discard cache data */
-                goto psx_client_can_cache;
-        }
-        /* BB FIXME need to fix this check to move it earlier into posix_open
-           BB  fIX following section BB FIXME */
-        /* if not oplocked, invalidate inode pages if mtime or file
-           size changed */
-/*      temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
-        if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
-                           (file->f_path.dentry->d_inode->i_size ==
-                            (loff_t)le64_to_cpu(buf->EndOfFile))) {
-                cFYI(1, "inode unchanged on server");
-        } else {
-                if (file->f_path.dentry->d_inode->i_mapping) {
-                        rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
-                        if (rc != 0)
-                                CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
-                }
-                cFYI(1, "invalidating remote inode since open detected it "
-                         "changed");
-                invalidate_remote_inode(file->f_path.dentry->d_inode);
-        } */
-psx_client_can_cache:
-        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                pCifsInode->clientCanCacheAll = true;
-                pCifsInode->clientCanCacheRead = true;
-                cFYI(1, "Exclusive Oplock granted on inode %p",
-                         file->f_path.dentry->d_inode);
-        } else if ((oplock & 0xF) == OPLOCK_READ)
-                pCifsInode->clientCanCacheRead = true;
-        /* will have to change the unlock if we reenable the
-           filemap_fdatawrite (which does not seem necessary */
-        write_unlock(&GlobalSMBSeslock);
-        return 0;
-}
-/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode,
-        struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
+        struct cifsTconInfo *pTcon, __u32 oplock, FILE_ALL_INFO *buf,
        char *full_path, int xid)
 {
        struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
@@ -191,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode,
                        /* BB no need to lock inode until after invalidate
                        since namei code should already have it locked? */
                        rc = filemap_write_and_wait(inode->i_mapping);
-                        if (rc != 0)
+                        mapping_set_error(inode->i_mapping, rc);
-                                pCifsInode->write_behind_rc = rc;
                }
                cFYI(1, "invalidating remote inode since open detected it "
                         "changed");
@@ -207,16 +146,166 @@ client_can_cache:
                rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
                                         xid, NULL);
-        if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+        cifs_set_oplock_level(pCifsInode, oplock);
-                pCifsInode->clientCanCacheAll = true;
-                pCifsInode->clientCanCacheRead = true;
+        return rc;
-                cFYI(1, "Exclusive Oplock granted on inode %p", inode);
+}
-        } else if ((*oplock & 0xF) == OPLOCK_READ)
-                pCifsInode->clientCanCacheRead = true;
+int cifs_posix_open(char *full_path, struct inode **pinode,
+                        struct super_block *sb, int mode, unsigned int f_flags,
+                        __u32 *poplock, __u16 *pnetfid, int xid)
+{
+        int rc;
+        FILE_UNIX_BASIC_INFO *presp_data;
+        __u32 posix_flags = 0;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+        struct cifs_fattr fattr;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *tcon;
+        cFYI(1, "posix open %s", full_path);
+        presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+        if (presp_data == NULL)
+                return -ENOMEM;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                goto posix_open_ret;
+        }
+        tcon = tlink_tcon(tlink);
+        mode &= ~current_umask();
+        posix_flags = cifs_posix_convert_flags(f_flags);
+        rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
+                             poplock, full_path, cifs_sb->local_nls,
+                             cifs_sb->mnt_cifs_flags &
+                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+        cifs_put_tlink(tlink);
+        if (rc)
+                goto posix_open_ret;
+        if (presp_data->Type == cpu_to_le32(-1))
+                goto posix_open_ret; /* open ok, caller does qpathinfo */
+        if (!pinode)
+                goto posix_open_ret; /* caller does not need info */
+        cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb);
+        /* get new inode and set it up */
+        if (*pinode == NULL) {
+                cifs_fill_uniqueid(sb, &fattr);
+                *pinode = cifs_iget(sb, &fattr);
+                if (!*pinode) {
+                        rc = -ENOMEM;
+                        goto posix_open_ret;
+                }
+        } else {
+                cifs_fattr_to_inode(*pinode, &fattr);
+        }
+posix_open_ret:
+        kfree(presp_data);
        return rc;
 }
+struct cifsFileInfo *
+cifs_new_fileinfo(__u16 fileHandle, struct file *file,
+                  struct tcon_link *tlink, __u32 oplock)
+{
+        struct dentry *dentry = file->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
+        struct cifsFileInfo *pCifsFile;
+        pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+        if (pCifsFile == NULL)
+                return pCifsFile;
+        pCifsFile->count = 1;
+        pCifsFile->netfid = fileHandle;
+        pCifsFile->pid = current->tgid;
+        pCifsFile->uid = current_fsuid();
+        pCifsFile->dentry = dget(dentry);
+        pCifsFile->f_flags = file->f_flags;
+        pCifsFile->invalidHandle = false;
+        pCifsFile->tlink = cifs_get_tlink(tlink);
+        mutex_init(&pCifsFile->fh_mutex);
+        mutex_init(&pCifsFile->lock_mutex);
+        INIT_LIST_HEAD(&pCifsFile->llist);
+        INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+        spin_lock(&cifs_file_list_lock);
+        list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
+        /* if readable file instance put first in list*/
+        if (file->f_mode & FMODE_READ)
+                list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+        else
+                list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
+        spin_unlock(&cifs_file_list_lock);
+        cifs_set_oplock_level(pCifsInode, oplock);
+        file->private_data = pCifsFile;
+        return pCifsFile;
+}
+/*
+ * Release a reference on the file private data. This may involve closing
+ * the filehandle out on the server. Must be called without holding
+ * cifs_file_list_lock.
+ */
+void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+        struct inode *inode = cifs_file->dentry->d_inode;
+        struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
+        struct cifsInodeInfo *cifsi = CIFS_I(inode);
+        struct cifsLockInfo *li, *tmp;
+        spin_lock(&cifs_file_list_lock);
+        if (--cifs_file->count > 0) {
+                spin_unlock(&cifs_file_list_lock);
+                return;
+        }
+        /* remove it from the lists */
+        list_del(&cifs_file->flist);
+        list_del(&cifs_file->tlist);
+        if (list_empty(&cifsi->openFileList)) {
+                cFYI(1, "closing last open instance for inode %p",
+                        cifs_file->dentry->d_inode);
+                cifs_set_oplock_level(cifsi, 0);
+        }
+        spin_unlock(&cifs_file_list_lock);
+        if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
+                int xid, rc;
+                xid = GetXid();
+                rc = CIFSSMBClose(xid, tcon, cifs_file->netfid);
+                FreeXid(xid);
+        }
+        /* Delete any outstanding lock records. We'll lose them when the file
+         * is closed anyway.
+         */
+        mutex_lock(&cifs_file->lock_mutex);
+        list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
+                list_del(&li->llist);
+                kfree(li);
+        }
+        mutex_unlock(&cifs_file->lock_mutex);
+        cifs_put_tlink(cifs_file->tlink);
+        dput(cifs_file->dentry);
+        kfree(cifs_file);
+}
 int cifs_open(struct inode *inode, struct file *file)
 {
        int rc = -EACCES;
@@ -224,6 +313,7 @@ int cifs_open(struct inode *inode, struct file *file)
        __u32 oplock;
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *tcon;
+        struct tcon_link *tlink;
        struct cifsFileInfo *pCifsFile = NULL;
        struct cifsInodeInfo *pCifsInode;
        char *full_path = NULL;
@@ -235,7 +325,12 @@ int cifs_open(struct inode *inode, struct file *file)
        xid = GetXid();
        cifs_sb = CIFS_SB(inode->i_sb);
-        tcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                FreeXid(xid);
+                return PTR_ERR(tlink);
+        }
+        tcon = tlink_tcon(tlink);
        pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
@@ -257,27 +352,15 @@ int cifs_open(struct inode *inode, struct file *file)
            (tcon->ses->capabilities & CAP_UNIX) &&
            (CIFS_UNIX_POSIX_PATH_OPS_CAP &
                        le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-                int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-                oflags |= SMB_O_CREAT;
                /* can not refresh inode info since size could be stale */
                rc = cifs_posix_open(full_path, &inode, inode->i_sb,
                                cifs_sb->mnt_file_mode /* ignored */,
-                                oflags, &oplock, &netfid, xid);
+                                file->f_flags, &oplock, &netfid, xid);
                if (rc == 0) {
                        cFYI(1, "posix open succeeded");
-                        /* no need for special case handling of setting mode
-                           on read only files needed here */
-                        rc = cifs_posix_open_inode_helper(inode, file,
-                                        pCifsInode, oplock, netfid);
-                        if (rc != 0) {
-                                CIFSSMBClose(xid, tcon, netfid);
-                                goto out;
-                        }
-                        pCifsFile = cifs_new_fileinfo(inode, netfid, file,
+                        pCifsFile = cifs_new_fileinfo(netfid, file, tlink,
-                                                        file->f_path.mnt,
+                                                      oplock);
-                                                        oflags);
                        if (pCifsFile == NULL) {
                                CIFSSMBClose(xid, tcon, netfid);
                                rc = -ENOMEM;
@@ -345,7 +428,7 @@ int cifs_open(struct inode *inode, struct file *file)
                goto out;
        }
-        if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+        if (tcon->ses->capabilities & CAP_NT_SMBS)
                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
                         desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
@@ -365,12 +448,11 @@ int cifs_open(struct inode *inode, struct file *file)
                goto out;
        }
-        rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
+        rc = cifs_open_inode_helper(inode, tcon, oplock, buf, full_path, xid);
        if (rc != 0)
                goto out;
-        pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
+        pCifsFile = cifs_new_fileinfo(netfid, file, tlink, oplock);
-                                        file->f_flags);
        if (pCifsFile == NULL) {
                rc = -ENOMEM;
                goto out;
@@ -402,6 +484,7 @@ out:
        kfree(buf);
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -416,14 +499,13 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
        return rc;
 }
-static int cifs_reopen_file(struct file *file, bool can_flush)
+static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
 {
        int rc = -EACCES;
        int xid;
        __u32 oplock;
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *tcon;
-        struct cifsFileInfo *pCifsFile;
        struct cifsInodeInfo *pCifsInode;
        struct inode *inode;
        char *full_path = NULL;
@@ -431,11 +513,6 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
        int disposition = FILE_OPEN;
        __u16 netfid;
-        if (file->private_data)
-                pCifsFile = file->private_data;
-        else
-                return -EBADF;
        xid = GetXid();
        mutex_lock(&pCifsFile->fh_mutex);
        if (!pCifsFile->invalidHandle) {
@@ -445,39 +522,24 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
                return rc;
        }
-        if (file->f_path.dentry == NULL) {
+        inode = pCifsFile->dentry->d_inode;
-                cERROR(1, "no valid name if dentry freed");
-                dump_stack();
-                rc = -EBADF;
-                goto reopen_error_exit;
-        }
-        inode = file->f_path.dentry->d_inode;
-        if (inode == NULL) {
-                cERROR(1, "inode not valid");
-                dump_stack();
-                rc = -EBADF;
-                goto reopen_error_exit;
-        }
        cifs_sb = CIFS_SB(inode->i_sb);
-        tcon = cifs_sb->tcon;
+        tcon = tlink_tcon(pCifsFile->tlink);
 /* can not grab rename sem here because various ops, including
   those that already have the rename sem can end up causing writepage
   to get called and if the server was down that means we end up here,
   and we can never tell if the caller already has the rename_sem */
-        full_path = build_path_from_dentry(file->f_path.dentry);
+        full_path = build_path_from_dentry(pCifsFile->dentry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-reopen_error_exit:
                mutex_unlock(&pCifsFile->fh_mutex);
                FreeXid(xid);
                return rc;
        }
        cFYI(1, "inode = 0x%p file flags 0x%x for %s",
-                 inode, file->f_flags, full_path);
+                 inode, pCifsFile->f_flags, full_path);
        if (oplockEnabled)
                oplock = REQ_OPLOCK;
@@ -487,8 +549,14 @@ reopen_error_exit:
        if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
            (CIFS_UNIX_POSIX_PATH_OPS_CAP &
                        le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-                int oflags = (int) cifs_posix_convert_flags(file->f_flags);
-                /* can not refresh inode info since size could be stale */
+                /*
+                 * O_CREAT, O_EXCL and O_TRUNC already had their effect on the
+                 * original open. Must mask them off for a reopen.
+                 */
+                unsigned int oflags = pCifsFile->f_flags &
+                                                ~(O_CREAT | O_EXCL | O_TRUNC);
                rc = cifs_posix_open(full_path, NULL, inode->i_sb,
                                cifs_sb->mnt_file_mode /* ignored */,
                                oflags, &oplock, &netfid, xid);
@@ -500,7 +568,7 @@ reopen_error_exit:
                   in the reconnect path it is important to retry hard */
        }
-        desiredAccess = cifs_convert_flags(file->f_flags);
+        desiredAccess = cifs_convert_flags(pCifsFile->f_flags);
        /* Can not refresh inode by passing in file_info buf to be returned
           by SMBOpen and then calling get_inode_info with returned buf
@@ -516,49 +584,38 @@ reopen_error_exit:
                mutex_unlock(&pCifsFile->fh_mutex);
                cFYI(1, "cifs_open returned 0x%x", rc);
                cFYI(1, "oplock: %d", oplock);
-        } else {
+                goto reopen_error_exit;
-reopen_success:
-                pCifsFile->netfid = netfid;
-                pCifsFile->invalidHandle = false;
-                mutex_unlock(&pCifsFile->fh_mutex);
-                pCifsInode = CIFS_I(inode);
-                if (pCifsInode) {
-                        if (can_flush) {
-                                rc = filemap_write_and_wait(inode->i_mapping);
-                                if (rc != 0)
-                                        CIFS_I(inode)->write_behind_rc = rc;
-                        /* temporarily disable caching while we
-                           go to server to get inode info */
-                                pCifsInode->clientCanCacheAll = false;
-                                pCifsInode->clientCanCacheRead = false;
-                                if (tcon->unix_ext)
-                                        rc = cifs_get_inode_info_unix(&inode,
-                                                full_path, inode->i_sb, xid);
-                                else
-                                        rc = cifs_get_inode_info(&inode,
-                                                full_path, NULL, inode->i_sb,
-                                                xid, NULL);
-                        } /* else we are writing out data to server already
-                             and could deadlock if we tried to flush data, and
-                             since we do not know if we have data that would
-                             invalidate the current end of file on the server
-                             we can not go to the server to get the new inod
-                             info */
-                        if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-                                pCifsInode->clientCanCacheAll = true;
-                                pCifsInode->clientCanCacheRead = true;
-                                cFYI(1, "Exclusive Oplock granted on inode %p",
-                                         file->f_path.dentry->d_inode);
-                        } else if ((oplock & 0xF) == OPLOCK_READ) {
-                                pCifsInode->clientCanCacheRead = true;
-                                pCifsInode->clientCanCacheAll = false;
-                        } else {
-                                pCifsInode->clientCanCacheRead = false;
-                                pCifsInode->clientCanCacheAll = false;
-                        }
-                        cifs_relock_file(pCifsFile);
-                }
        }
+reopen_success:
+        pCifsFile->netfid = netfid;
+        pCifsFile->invalidHandle = false;
+        mutex_unlock(&pCifsFile->fh_mutex);
+        pCifsInode = CIFS_I(inode);
+        if (can_flush) {
+                rc = filemap_write_and_wait(inode->i_mapping);
+                mapping_set_error(inode->i_mapping, rc);
+                if (tcon->unix_ext)
+                        rc = cifs_get_inode_info_unix(&inode,
+                                full_path, inode->i_sb, xid);
+                else
+                        rc = cifs_get_inode_info(&inode,
+                                full_path, NULL, inode->i_sb,
+                                xid, NULL);
+        } /* else we are writing out data to server already
+             and could deadlock if we tried to flush data, and
+             since we do not know if we have data that would
+             invalidate the current end of file on the server
+             we can not go to the server to get the new inod
+             info */
+        cifs_set_oplock_level(pCifsInode, oplock);
+        cifs_relock_file(pCifsFile);
+reopen_error_exit:
        kfree(full_path);
        FreeXid(xid);
        return rc;
@@ -566,79 +623,11 @@ reopen_success:
 int cifs_close(struct inode *inode, struct file *file)
 {
-        int rc = 0;
+        cifsFileInfo_put(file->private_data);
-        int xid, timeout;
+        file->private_data = NULL;
-        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
-        struct cifsFileInfo *pSMBFile = file->private_data;
-        xid = GetXid();
-        cifs_sb = CIFS_SB(inode->i_sb);
+        /* return code from the ->release op is always ignored */
-        pTcon = cifs_sb->tcon;
+        return 0;
-        if (pSMBFile) {
-                struct cifsLockInfo *li, *tmp;
-                write_lock(&GlobalSMBSeslock);
-                pSMBFile->closePend = true;
-                if (pTcon) {
-                        /* no sense reconnecting to close a file that is
-                           already closed */
-                        if (!pTcon->need_reconnect) {
-                                write_unlock(&GlobalSMBSeslock);
-                                timeout = 2;
-                                while ((atomic_read(&pSMBFile->count) != 1)
-                                        && (timeout <= 2048)) {
-                                        /* Give write a better chance to get to
-                                        server ahead of the close.  We do not
-                                        want to add a wait_q here as it would
-                                        increase the memory utilization as
-                                        the struct would be in each open file,
-                                        but this should give enough time to
-                                        clear the socket */
-                                        cFYI(DBG2, "close delay, write pending");
-                                        msleep(timeout);
-                                        timeout *= 4;
-                                }
-                                if (!pTcon->need_reconnect &&
-                                    !pSMBFile->invalidHandle)
-                                        rc = CIFSSMBClose(xid, pTcon,
-                                                  pSMBFile->netfid);
-                        } else
-                                write_unlock(&GlobalSMBSeslock);
-                } else
-                        write_unlock(&GlobalSMBSeslock);
-                /* Delete any outstanding lock records.
-                   We'll lose them when the file is closed anyway. */
-                mutex_lock(&pSMBFile->lock_mutex);
-                list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
-                        list_del(&li->llist);
-                        kfree(li);
-                }
-                mutex_unlock(&pSMBFile->lock_mutex);
-                write_lock(&GlobalSMBSeslock);
-                list_del(&pSMBFile->flist);
-                list_del(&pSMBFile->tlist);
-                write_unlock(&GlobalSMBSeslock);
-                cifsFileInfo_put(file->private_data);
-                file->private_data = NULL;
-        } else
-                rc = -EBADF;
-        read_lock(&GlobalSMBSeslock);
-        if (list_empty(&(CIFS_I(inode)->openFileList))) {
-                cFYI(1, "closing last open instance for inode %p", inode);
-                /* if the file is not open we do not know if we can cache info
-                   on this inode, much less write behind and read ahead */
-                CIFS_I(inode)->clientCanCacheRead = false;
-                CIFS_I(inode)->clientCanCacheAll  = false;
-        }
-        read_unlock(&GlobalSMBSeslock);
-        if ((rc == 0) && CIFS_I(inode)->write_behind_rc)
-                rc = CIFS_I(inode)->write_behind_rc;
-        FreeXid(xid);
-        return rc;
 }
 int cifs_closedir(struct inode *inode, struct file *file)
@@ -653,25 +642,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
        xid = GetXid();
        if (pCFileStruct) {
-                struct cifsTconInfo *pTcon;
+                struct cifsTconInfo *pTcon = tlink_tcon(pCFileStruct->tlink);
-                struct cifs_sb_info *cifs_sb =
-                        CIFS_SB(file->f_path.dentry->d_sb);
-                pTcon = cifs_sb->tcon;
                cFYI(1, "Freeing private data in close dir");
-                write_lock(&GlobalSMBSeslock);
+                spin_lock(&cifs_file_list_lock);
                if (!pCFileStruct->srch_inf.endOfSearch &&
                    !pCFileStruct->invalidHandle) {
                        pCFileStruct->invalidHandle = true;
-                        write_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
                        rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
                        cFYI(1, "Closing uncompleted readdir with rc %d",
                                 rc);
                        /* not much we can do if it fails anyway, ignore rc */
                        rc = 0;
                } else
-                        write_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
                ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
                if (ptmp) {
                        cFYI(1, "closedir free smb buf in srch struct");
@@ -681,6 +666,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
                        else
                                cifs_buf_release(ptmp);
                }
+                cifs_put_tlink(pCFileStruct->tlink);
                kfree(file->private_data);
                file->private_data = NULL;
        }
@@ -767,13 +753,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                cFYI(1, "Unknown type of lock");
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        tcon = cifs_sb->tcon;
+        tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
-        if (file->private_data == NULL) {
-                rc = -EBADF;
-                FreeXid(xid);
-                return rc;
-        }
        netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
        if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -949,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        size_t write_size, loff_t *poffset)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
        int rc = 0;
        unsigned int bytes_written = 0;
        unsigned int total_written;
@@ -956,18 +937,18 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        struct cifsTconInfo *pTcon;
        int xid, long_op;
        struct cifsFileInfo *open_file;
-        struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
        /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
           *poffset, file->f_path.dentry->d_name.name); */
        if (file->private_data == NULL)
                return -EBADF;
        open_file = file->private_data;
+        pTcon = tlink_tcon(open_file->tlink);
        rc = generic_write_checks(file, poffset, &write_size, 0);
        if (rc)
@@ -988,19 +969,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                           we blocked so return what we managed to write */
                                return total_written;
                        }
-                        if (open_file->closePend) {
-                                FreeXid(xid);
-                                if (total_written)
-                                        return total_written;
-                                else
-                                        return -EBADF;
-                        }
                        if (open_file->invalidHandle) {
                                /* we could deadlock if we called
                                   filemap_fdatawait from here so tell
                                   reopen_file not to flush data to server
                                   now */
-                                rc = cifs_reopen_file(file, false);
+                                rc = cifs_reopen_file(open_file, false);
                                if (rc != 0)
                                        break;
                        }
@@ -1029,27 +1003,24 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        cifs_stats_bytes_written(pTcon, total_written);
-        /* since the write may have blocked check these pointers again */
-        if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-                struct inode *inode = file->f_path.dentry->d_inode;
 /* Do not update local mtime - server will set its actual value on write
- *              inode->i_ctime = inode->i_mtime =
+ *      inode->i_ctime = inode->i_mtime =
- *                      current_fs_time(inode->i_sb);*/
+ *              current_fs_time(inode->i_sb);*/
-                if (total_written > 0) {
+        if (total_written > 0) {
-                        spin_lock(&inode->i_lock);
+                spin_lock(&inode->i_lock);
-                        if (*poffset > file->f_path.dentry->d_inode->i_size)
+                if (*poffset > inode->i_size)
-                                i_size_write(file->f_path.dentry->d_inode,
+                        i_size_write(inode, *poffset);
-                                        *poffset);
+                spin_unlock(&inode->i_lock);
-                        spin_unlock(&inode->i_lock);
-                }
-                mark_inode_dirty_sync(file->f_path.dentry->d_inode);
        }
+        mark_inode_dirty_sync(inode);
        FreeXid(xid);
        return total_written;
 }
-static ssize_t cifs_write(struct file *file, const char *write_data,
+static ssize_t cifs_write(struct cifsFileInfo *open_file,
-                          size_t write_size, loff_t *poffset)
+                          const char *write_data, size_t write_size,
+                          loff_t *poffset)
 {
        int rc = 0;
        unsigned int bytes_written = 0;
@@ -1057,19 +1028,15 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *pTcon;
        int xid, long_op;
-        struct cifsFileInfo *open_file;
+        struct dentry *dentry = open_file->dentry;
-        struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode);
+        struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
+        cifs_sb = CIFS_SB(dentry->d_sb);
        cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
-           *poffset, file->f_path.dentry->d_name.name);
+           *poffset, dentry->d_name.name);
-        if (file->private_data == NULL)
+        pTcon = tlink_tcon(open_file->tlink);
-                return -EBADF;
-        open_file = file->private_data;
        xid = GetXid();
@@ -1078,28 +1045,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
             total_written += bytes_written) {
                rc = -EAGAIN;
                while (rc == -EAGAIN) {
-                        if (file->private_data == NULL) {
-                                /* file has been closed on us */
-                                FreeXid(xid);
-                        /* if we have gotten here we have written some data
-                           and blocked, and the file has been freed on us
-                           while we blocked so return what we managed to
-                           write */
-                                return total_written;
-                        }
-                        if (open_file->closePend) {
-                                FreeXid(xid);
-                                if (total_written)
-                                        return total_written;
-                                else
-                                        return -EBADF;
-                        }
                        if (open_file->invalidHandle) {
                                /* we could deadlock if we called
                                   filemap_fdatawait from here so tell
                                   reopen_file not to flush data to
                                   server now */
-                                rc = cifs_reopen_file(file, false);
+                                rc = cifs_reopen_file(open_file, false);
                                if (rc != 0)
                                        break;
                        }
@@ -1146,43 +1097,41 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        cifs_stats_bytes_written(pTcon, total_written);
-        /* since the write may have blocked check these pointers again */
+        if (total_written > 0) {
-        if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
+                spin_lock(&dentry->d_inode->i_lock);
-/*BB We could make this contingent on superblock ATIME flag too */
+                if (*poffset > dentry->d_inode->i_size)
-/*              file->f_path.dentry->d_inode->i_ctime =
+                        i_size_write(dentry->d_inode, *poffset);
-                file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/
+                spin_unlock(&dentry->d_inode->i_lock);
-                if (total_written > 0) {
-                        spin_lock(&file->f_path.dentry->d_inode->i_lock);
-                        if (*poffset > file->f_path.dentry->d_inode->i_size)
-                                i_size_write(file->f_path.dentry->d_inode,
-                                             *poffset);
-                        spin_unlock(&file->f_path.dentry->d_inode->i_lock);
-                }
-                mark_inode_dirty_sync(file->f_path.dentry->d_inode);
        }
+        mark_inode_dirty_sync(dentry->d_inode);
        FreeXid(xid);
        return total_written;
 }
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
+                                        bool fsuid_only)
 {
        struct cifsFileInfo *open_file = NULL;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+        /* only filter by fsuid on multiuser mounts */
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+                fsuid_only = false;
-        read_lock(&GlobalSMBSeslock);
+        spin_lock(&cifs_file_list_lock);
        /* we could simply get the first_list_entry since write-only entries
           are always at the end of the list but since the first entry might
           have a close pending, we go through the whole list */
        list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-                if (open_file->closePend)
+                if (fsuid_only && open_file->uid != current_fsuid())
                        continue;
-                if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) ||
+                if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
-                    (open_file->pfile->f_flags & O_RDONLY))) {
                        if (!open_file->invalidHandle) {
                                /* found a good file */
                                /* lock it so it will not be closed on us */
                                cifsFileInfo_get(open_file);
-                                read_unlock(&GlobalSMBSeslock);
+                                spin_unlock(&cifs_file_list_lock);
                                return open_file;
                        } /* else might as well continue, and look for
                             another, or simply have the caller reopen it
@@ -1190,14 +1139,16 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
                } else /* write only file */
                        break; /* write only files are last so must be done */
        }
-        read_unlock(&GlobalSMBSeslock);
+        spin_unlock(&cifs_file_list_lock);
        return NULL;
 }
 #endif
-struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
+struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
+                                        bool fsuid_only)
 {
        struct cifsFileInfo *open_file;
+        struct cifs_sb_info *cifs_sb;
        bool any_available = false;
        int rc;
@@ -1211,53 +1162,41 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
                return NULL;
        }
-        read_lock(&GlobalSMBSeslock);
+        cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+        /* only filter by fsuid on multiuser mounts */
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
+                fsuid_only = false;
+        spin_lock(&cifs_file_list_lock);
 refind_writable:
        list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-                if (open_file->closePend ||
+                if (!any_available && open_file->pid != current->tgid)
-                    (!any_available && open_file->pid != current->tgid))
                        continue;
+                if (fsuid_only && open_file->uid != current_fsuid())
-                if (open_file->pfile &&
+                        continue;
-                    ((open_file->pfile->f_flags & O_RDWR) ||
+                if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
-                     (open_file->pfile->f_flags & O_WRONLY))) {
                        cifsFileInfo_get(open_file);
                        if (!open_file->invalidHandle) {
                                /* found a good writable file */
-                                read_unlock(&GlobalSMBSeslock);
+                                spin_unlock(&cifs_file_list_lock);
                                return open_file;
                        }
-                        read_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
                        /* Had to unlock since following call can block */
-                        rc = cifs_reopen_file(open_file->pfile, false);
+                        rc = cifs_reopen_file(open_file, false);
-                        if (!rc) {
+                        if (!rc)
-                                if (!open_file->closePend)
+                                return open_file;
-                                        return open_file;
-                                else { /* start over in case this was deleted */
-                                       /* since the list could be modified */
-                                        read_lock(&GlobalSMBSeslock);
-                                        cifsFileInfo_put(open_file);
-                                        goto refind_writable;
-                                }
-                        }
-                        /* if it fails, try another handle if possible -
+                        /* if it fails, try another handle if possible */
-                        (we can not do this if closePending since
-                        loop could be modified - in which case we
-                        have to start at the beginning of the list
-                        again. Note that it would be bad
-                        to hold up writepages here (rather than
-                        in caller) with continuous retries */
                        cFYI(1, "wp failed on reopen file");
-                        read_lock(&GlobalSMBSeslock);
-                        /* can not use this handle, no write
-                           pending on this one after all */
                        cifsFileInfo_put(open_file);
-                        if (open_file->closePend) /* list could have changed */
+                        spin_lock(&cifs_file_list_lock);
-                                goto refind_writable;
                        /* else we simply continue to the next entry. Thus
                           we do not loop on reopen errors.  If we
                           can not reopen the file, for example if we
@@ -1272,7 +1211,7 @@ refind_writable:
                any_available = true;
                goto refind_writable;
        }
-        read_unlock(&GlobalSMBSeslock);
+        spin_unlock(&cifs_file_list_lock);
        return NULL;
 }
@@ -1284,7 +1223,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
        int rc = -EFAULT;
        int bytes_written = 0;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
        struct inode *inode;
        struct cifsFileInfo *open_file;
@@ -1293,7 +1231,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
        inode = page->mapping->host;
        cifs_sb = CIFS_SB(inode->i_sb);
-        pTcon = cifs_sb->tcon;
        offset += (loff_t)from;
        write_data = kmap(page);
@@ -1314,10 +1251,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
        if (mapping->host->i_size - offset < (loff_t)to)
                to = (unsigned)(mapping->host->i_size - offset);
-        open_file = find_writable_file(CIFS_I(mapping->host));
+        open_file = find_writable_file(CIFS_I(mapping->host), false);
        if (open_file) {
-                bytes_written = cifs_write(open_file->pfile, write_data,
+                bytes_written = cifs_write(open_file, write_data,
-                                           to-from, &offset);
+                                           to - from, &offset);
                cifsFileInfo_put(open_file);
                /* Does mm or vfs already set times? */
                inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
@@ -1337,7 +1274,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 static int cifs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        unsigned int bytes_to_write;
        unsigned int bytes_written;
        struct cifs_sb_info *cifs_sb;
@@ -1352,6 +1288,7 @@ static int cifs_writepages(struct address_space *mapping,
        int nr_pages;
        __u64 offset = 0;
        struct cifsFileInfo *open_file;
+        struct cifsTconInfo *tcon;
        struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
        struct page *page;
        struct pagevec pvec;
@@ -1368,26 +1305,29 @@ static int cifs_writepages(struct address_space *mapping,
        if (cifs_sb->wsize < PAGE_CACHE_SIZE)
                return generic_writepages(mapping, wbc);
-        if ((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
-                if (cifs_sb->tcon->ses->server->secMode &
-                                (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-                        if (!experimEnabled)
-                                return generic_writepages(mapping, wbc);
        iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
        if (iov == NULL)
                return generic_writepages(mapping, wbc);
        /*
-         * BB: Is this meaningful for a non-block-device file system?
+         * if there's no open file, then this is likely to fail too,
-         * If it is, we should test it again after we do I/O
+         * but it'll at least handle the return. Maybe it should be
+         * a BUG() instead?
         */
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+        open_file = find_writable_file(CIFS_I(mapping->host), false);
-                wbc->encountered_congestion = 1;
+        if (!open_file) {
                kfree(iov);
-                return 0;
+                return generic_writepages(mapping, wbc);
+        }
+        tcon = tlink_tcon(open_file->tlink);
+        if (!experimEnabled && tcon->ses->server->secMode &
+                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+                cifsFileInfo_put(open_file);
+                kfree(iov);
+                return generic_writepages(mapping, wbc);
        }
+        cifsFileInfo_put(open_file);
        xid = GetXid();
@@ -1492,38 +1432,29 @@ retry:
                                break;
                }
                if (n_iov) {
-                        /* Search for a writable handle every time we call
+                        open_file = find_writable_file(CIFS_I(mapping->host),
-                         * CIFSSMBWrite2.  We can't rely on the last handle
+                                                        false);
-                         * we used to still be valid
-                         */
-                        open_file = find_writable_file(CIFS_I(mapping->host));
                        if (!open_file) {
                                cERROR(1, "No writable handles for inode");
                                rc = -EBADF;
                        } else {
                                long_op = cifs_write_timeout(cifsi, offset);
-                                rc = CIFSSMBWrite2(xid, cifs_sb->tcon,
+                                rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
-                                                   open_file->netfid,
                                                   bytes_to_write, offset,
                                                   &bytes_written, iov, n_iov,
                                                   long_op);
                                cifsFileInfo_put(open_file);
                                cifs_update_eof(cifsi, offset, bytes_written);
+                        }
-                                if (rc || bytes_written < bytes_to_write) {
+                        if (rc || bytes_written < bytes_to_write) {
-                                        cERROR(1, "Write2 ret %d, wrote %d",
+                                cERROR(1, "Write2 ret %d, wrote %d",
-                                                  rc, bytes_written);
+                                          rc, bytes_written);
-                                        /* BB what if continued retry is
+                                mapping_set_error(mapping, rc);
-                                           requested via mount flags? */
+                        } else {
-                                        if (rc == -ENOSPC)
+                                cifs_stats_bytes_written(tcon, bytes_written);
-                                                set_bit(AS_ENOSPC, &mapping->flags);
-                                        else
-                                                set_bit(AS_EIO, &mapping->flags);
-                                } else {
-                                        cifs_stats_bytes_written(cifs_sb->tcon,
-                                                                 bytes_written);
-                                }
                        }
                        for (i = 0; i < n_iov; i++) {
                                page = pvec.pages[first + i];
                                /* Should we also set page error on
@@ -1624,7 +1555,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
                /* BB check if anything else missing out of ppw
                   such as updating last write time */
                page_data = kmap(page);
-                rc = cifs_write(file, page_data + offset, copied, &pos);
+                rc = cifs_write(file->private_data, page_data + offset,
+                                copied, &pos);
                /* if (rc < 0) should we set writebehind rc? */
                kunmap(page);
@@ -1663,11 +1595,10 @@ int cifs_fsync(struct file *file, int datasync)
        rc = filemap_write_and_wait(inode->i_mapping);
        if (rc == 0) {
-                rc = CIFS_I(inode)->write_behind_rc;
+                struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-                CIFS_I(inode)->write_behind_rc = 0;
-                tcon = CIFS_SB(inode->i_sb)->tcon;
+                tcon = tlink_tcon(smbfile->tlink);
-                if (!rc && tcon && smbfile &&
+                if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
-                   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
                        rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
        }
@@ -1712,21 +1643,8 @@ int cifs_flush(struct file *file, fl_owner_t id)
        struct inode *inode = file->f_path.dentry->d_inode;
        int rc = 0;
-        /* Rather than do the steps manually:
+        if (file->f_mode & FMODE_WRITE)
-           lock the inode for writing
+                rc = filemap_write_and_wait(inode->i_mapping);
-           loop through pages looking for write behind data (dirty pages)
-           coalesce into contiguous 16K (or smaller) chunks to write to server
-           send to server (prefer in parallel)
-           deal with writebehind errors
-           unlock inode for writing
-           filemapfdatawrite appears easier for the time being */
-        rc = filemap_fdatawrite(inode->i_mapping);
-        /* reset wb rc if we were able to write out dirty pages */
-        if (!rc) {
-                rc = CIFS_I(inode)->write_behind_rc;
-                CIFS_I(inode)->write_behind_rc = 0;
-        }
        cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
@@ -1750,7 +1668,6 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
        if (file->private_data == NULL) {
                rc = -EBADF;
@@ -1758,6 +1675,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                return rc;
        }
        open_file = file->private_data;
+        pTcon = tlink_tcon(open_file->tlink);
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1771,9 +1689,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                smb_read_data = NULL;
                while (rc == -EAGAIN) {
                        int buf_type = CIFS_NO_BUFFER;
-                        if ((open_file->invalidHandle) &&
+                        if (open_file->invalidHandle) {
-                            (!open_file->closePend)) {
+                                rc = cifs_reopen_file(open_file, true);
-                                rc = cifs_reopen_file(file, true);
                                if (rc != 0)
                                        break;
                        }
@@ -1831,7 +1748,6 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
        if (file->private_data == NULL) {
                rc = -EBADF;
@@ -1839,6 +1755,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
                return rc;
        }
        open_file = file->private_data;
+        pTcon = tlink_tcon(open_file->tlink);
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1857,9 +1774,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
                }
                rc = -EAGAIN;
                while (rc == -EAGAIN) {
-                        if ((open_file->invalidHandle) &&
+                        if (open_file->invalidHandle) {
-                            (!open_file->closePend)) {
+                                rc = cifs_reopen_file(open_file, true);
-                                rc = cifs_reopen_file(file, true);
                                if (rc != 0)
                                        break;
                        }
@@ -1974,7 +1890,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        }
        open_file = file->private_data;
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
+        pTcon = tlink_tcon(open_file->tlink);
        /*
         * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2022,9 +1938,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                                read_size, contig_pages);
                rc = -EAGAIN;
                while (rc == -EAGAIN) {
-                        if ((open_file->invalidHandle) &&
+                        if (open_file->invalidHandle) {
-                            (!open_file->closePend)) {
+                                rc = cifs_reopen_file(open_file, true);
-                                rc = cifs_reopen_file(file, true);
                                if (rc != 0)
                                        break;
                        }
@@ -2173,18 +2088,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
 {
        struct cifsFileInfo *open_file;
-        read_lock(&GlobalSMBSeslock);
+        spin_lock(&cifs_file_list_lock);
        list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-                if (open_file->closePend)
+                if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
-                        continue;
+                        spin_unlock(&cifs_file_list_lock);
-                if (open_file->pfile &&
-                    ((open_file->pfile->f_flags & O_RDWR) ||
-                     (open_file->pfile->f_flags & O_WRONLY))) {
-                        read_unlock(&GlobalSMBSeslock);
                        return 1;
                }
        }
-        read_unlock(&GlobalSMBSeslock);
+        spin_unlock(&cifs_file_list_lock);
        return 0;
 }
@@ -2310,10 +2221,9 @@ void cifs_oplock_break(struct work_struct *work)
 {
        struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
                                                  oplock_break);
-        struct inode *inode = cfile->pInode;
+        struct inode *inode = cfile->dentry->d_inode;
        struct cifsInodeInfo *cinode = CIFS_I(inode);
-        struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
+        int rc = 0;
-        int rc, waitrc = 0;
        if (inode && S_ISREG(inode->i_mode)) {
                if (cinode->clientCanCacheRead)
@@ -2322,13 +2232,10 @@ void cifs_oplock_break(struct work_struct *work)
                        break_lease(inode, O_WRONLY);
                rc = filemap_fdatawrite(inode->i_mapping);
                if (cinode->clientCanCacheRead == 0) {
-                        waitrc = filemap_fdatawait(inode->i_mapping);
+                        rc = filemap_fdatawait(inode->i_mapping);
+                        mapping_set_error(inode->i_mapping, rc);
                        invalidate_remote_inode(inode);
                }
-                if (!rc)
-                        rc = waitrc;
-                if (rc)
-                        cinode->write_behind_rc = rc;
                cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
        }
@@ -2338,33 +2245,34 @@ void cifs_oplock_break(struct work_struct *work)
         * not bother sending an oplock release if session to server still is
         * disconnected since oplock already released by the server
         */
-        if (!cfile->closePend && !cfile->oplock_break_cancelled) {
+        if (!cfile->oplock_break_cancelled) {
-                rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
+                rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
-                                 LOCKING_ANDX_OPLOCK_RELEASE, false);
+                                 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
                cFYI(1, "Oplock release rc = %d", rc);
        }
        /*
         * We might have kicked in before is_valid_oplock_break()
         * finished grabbing reference for us.  Make sure it's done by
-         * waiting for GlobalSMSSeslock.
+         * waiting for cifs_file_list_lock.
         */
-        write_lock(&GlobalSMBSeslock);
+        spin_lock(&cifs_file_list_lock);
-        write_unlock(&GlobalSMBSeslock);
+        spin_unlock(&cifs_file_list_lock);
        cifs_oplock_break_put(cfile);
 }
+/* must be called while holding cifs_file_list_lock */
 void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-        mntget(cfile->mnt);
+        cifs_sb_active(cfile->dentry->d_sb);
        cifsFileInfo_get(cfile);
 }
 void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-        mntput(cfile->mnt);
        cifsFileInfo_put(cfile);
+        cifs_sb_deactive(cfile->dentry->d_sb);
 }
 const struct address_space_operations cifs_addr_ops = {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 9f3f5c4be161..a2ad94efcfe6 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -62,15 +62,15 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 {
        struct cifsInodeInfo *cifsi = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        if (cifsi->fscache)
                return;
-        cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+        cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
-                                &cifs_fscache_inode_object_def,
+                                &cifs_fscache_inode_object_def, cifsi);
-                                cifsi);
+        cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
-        cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
+                                cifsi->fscache);
-                        cifs_sb->tcon->fscache, cifsi->fscache);
 }
 void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -117,7 +117,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
                /* retire the current fscache cache and get a new one */
                fscache_relinquish_cookie(cifsi->fscache, 1);
-                cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
+                cifsi->fscache = fscache_acquire_cookie(
+                                        cifs_sb_master_tcon(cifs_sb)->fscache,
                                        &cifs_fscache_inode_object_def,
                                        cifsi);
                cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 53cce8cc2224..ef3a55bf86b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -52,7 +52,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
                /* check if server can support readpages */
-                if (cifs_sb->tcon->ses->server->maxBuf <
+                if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
                                PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
                        inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
                else
@@ -288,8 +288,8 @@ int cifs_get_file_info_unix(struct file *filp)
        struct cifs_fattr fattr;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
        struct cifsFileInfo *cfile = filp->private_data;
+        struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
        xid = GetXid();
        rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -313,15 +313,21 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        FILE_UNIX_BASIC_INFO find_data;
        struct cifs_fattr fattr;
        struct cifsTconInfo *tcon;
+        struct tcon_link *tlink;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        tcon = cifs_sb->tcon;
        cFYI(1, "Getting info on %s", full_path);
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
        /* could have done a find first instead but this returns more info */
        rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
                                  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+        cifs_put_tlink(tlink);
        if (!rc) {
                cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
@@ -332,6 +338,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                return rc;
        }
+        /* check for Minshall+French symlinks */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+                int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+                if (tmprc)
+                        cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+        }
        if (*pinode == NULL) {
                /* get new inode */
                cifs_fill_uniqueid(sb, &fattr);
@@ -353,7 +366,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
        int rc;
        int oplock = 0;
        __u16 netfid;
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *tcon;
        char buf[24];
        unsigned int bytes_read;
        char *pbuf;
@@ -372,7 +386,12 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
                return -EINVAL;  /* EOPNOTSUPP? */
        }
-        rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
+        rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
                         CREATE_NOT_DIR, &netfid, &oplock, NULL,
                         cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags &
@@ -380,7 +399,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
        if (rc == 0) {
                int buf_type = CIFS_NO_BUFFER;
                        /* Read header */
-                rc = CIFSSMBRead(xid, pTcon, netfid,
+                rc = CIFSSMBRead(xid, tcon, netfid,
                                 24 /* length */, 0 /* offset */,
                                 &bytes_read, &pbuf, &buf_type);
                if ((rc == 0) && (bytes_read >= 8)) {
@@ -422,8 +441,9 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
                        fattr->cf_dtype = DT_REG;
                        rc = -EOPNOTSUPP; /* or some unknown SFU type */
                }
-                CIFSSMBClose(xid, pTcon, netfid);
+                CIFSSMBClose(xid, tcon, netfid);
        }
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -441,11 +461,19 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
        ssize_t rc;
        char ea_value[4];
        __u32 mode;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
-        rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
+        rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
                            ea_value, 4 /* size of buf */, cifs_sb->local_nls,
                            cifs_sb->mnt_cifs_flags &
                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        cifs_put_tlink(tlink);
        if (rc < 0)
                return (int)rc;
        else if (rc > 3) {
@@ -468,6 +496,8 @@ static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
                       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        memset(fattr, 0, sizeof(*fattr));
        fattr->cf_cifsattrs = le32_to_cpu(info->Attributes);
        if (info->DeletePending)
@@ -482,8 +512,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
        fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
        if (adjust_tz) {
-                fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+                fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
-                fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj;
+                fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
        }
        fattr->cf_eof = le64_to_cpu(info->EndOfFile);
@@ -515,8 +545,8 @@ int cifs_get_file_info(struct file *filp)
        struct cifs_fattr fattr;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
        struct cifsFileInfo *cfile = filp->private_data;
+        struct cifsTconInfo *tcon = tlink_tcon(cfile->tlink);
        xid = GetXid();
        rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -554,26 +584,33 @@ int cifs_get_inode_info(struct inode **pinode,
 {
        int rc = 0, tmprc;
        struct cifsTconInfo *pTcon;
+        struct tcon_link *tlink;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        char *buf = NULL;
        bool adjustTZ = false;
        struct cifs_fattr fattr;
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
        cFYI(1, "Getting info on %s", full_path);
        if ((pfindData == NULL) && (*pinode != NULL)) {
                if (CIFS_I(*pinode)->clientCanCacheRead) {
                        cFYI(1, "No need to revalidate cached inode sizes");
-                        return rc;
+                        goto cgii_exit;
                }
        }
        /* if file info not passed in then get it from server */
        if (pfindData == NULL) {
                buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-                if (buf == NULL)
+                if (buf == NULL) {
-                        return -ENOMEM;
+                        rc = -ENOMEM;
+                        goto cgii_exit;
+                }
                pfindData = (FILE_ALL_INFO *)buf;
                /* could do find first instead but this returns more info */
@@ -661,6 +698,13 @@ int cifs_get_inode_info(struct inode **pinode,
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
                cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
+        /* check for Minshall+French symlinks */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+                tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+                if (tmprc)
+                        cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+        }
        if (!*pinode) {
                *pinode = cifs_iget(sb, &fattr);
                if (!*pinode)
@@ -671,6 +715,7 @@ int cifs_get_inode_info(struct inode **pinode,
 cgii_exit:
        kfree(buf);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -683,6 +728,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
        int pplen = cifs_sb->prepathlen;
        int dfsplen;
        char *full_path = NULL;
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        /* if no prefix path, simply set path to the root of share to "" */
        if (pplen == 0) {
@@ -692,8 +738,8 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
                return full_path;
        }
-        if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
+        if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
-                dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+                dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
        else
                dfsplen = 0;
@@ -702,7 +748,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
                return full_path;
        if (dfsplen) {
-                strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+                strncpy(full_path, tcon->treeName, dfsplen);
                /* switch slash direction in prepath depending on whether
                 * windows or posix style path names
                 */
@@ -818,18 +864,18 @@ retry_iget5_locked:
 struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
        int xid;
-        struct cifs_sb_info *cifs_sb;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
        struct inode *inode = NULL;
        long rc;
        char *full_path;
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
-        cifs_sb = CIFS_SB(sb);
        full_path = cifs_build_path_to_root(cifs_sb);
        if (full_path == NULL)
                return ERR_PTR(-ENOMEM);
        xid = GetXid();
-        if (cifs_sb->tcon->unix_ext)
+        if (tcon->unix_ext)
                rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
        else
                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -840,10 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 #ifdef CONFIG_CIFS_FSCACHE
        /* populate tcon->resource_id */
-        cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+        tcon->resource_id = CIFS_I(inode)->uniqueid;
 #endif
-        if (rc && cifs_sb->tcon->ipc) {
+        if (rc && tcon->ipc) {
                cFYI(1, "ipc connection - fake read inode");
                inode->i_mode |= S_IFDIR;
                inode->i_nlink = 2;
@@ -879,7 +925,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
        struct cifsFileInfo *open_file;
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct tcon_link *tlink = NULL;
+        struct cifsTconInfo *pTcon;
        FILE_BASIC_INFO info_buf;
        if (attrs == NULL)
@@ -918,13 +965,22 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
        /*
         * If the file is already open for write, just use that fileid
         */
-        open_file = find_writable_file(cifsInode);
+        open_file = find_writable_file(cifsInode, true);
        if (open_file) {
                netfid = open_file->netfid;
                netpid = open_file->pid;
+                pTcon = tlink_tcon(open_file->tlink);
                goto set_via_filehandle;
        }
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                tlink = NULL;
+                goto out;
+        }
+        pTcon = tlink_tcon(tlink);
        /*
         * NT4 apparently returns success on this call, but it doesn't
         * really work.
@@ -968,6 +1024,8 @@ set_via_filehandle:
        else
                cifsFileInfo_put(open_file);
 out:
+        if (tlink != NULL)
+                cifs_put_tlink(tlink);
        return rc;
 }
@@ -985,10 +1043,16 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
        struct inode *inode = dentry->d_inode;
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *tcon;
        __u32 dosattr, origattr;
        FILE_BASIC_INFO *info_buf = NULL;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
        rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
                         DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
                         &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1057,6 +1121,7 @@ out_close:
        CIFSSMBClose(xid, tcon, netfid);
 out:
        kfree(info_buf);
+        cifs_put_tlink(tlink);
        return rc;
        /*
@@ -1096,12 +1161,18 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
        struct cifsInodeInfo *cifs_inode;
        struct super_block *sb = dir->i_sb;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *tcon;
        struct iattr *attrs = NULL;
        __u32 dosattr = 0, origattr = 0;
        cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
        xid = GetXid();
        /* Unlink can be called from rename so we can not take the
@@ -1109,8 +1180,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
        full_path = build_path_from_dentry(dentry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto unlink_out;
-                return rc;
        }
        if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -1176,10 +1246,11 @@ out_reval:
        dir->i_ctime = dir->i_mtime = current_fs_time(sb);
        cifs_inode = CIFS_I(dir);
        CIFS_I(dir)->time = 0;  /* force revalidate of dir as well */
+unlink_out:
        kfree(full_path);
        kfree(attrs);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -1188,6 +1259,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        int rc = 0, tmprc;
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
@@ -1195,16 +1267,18 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
-        xid = GetXid();
        cifs_sb = CIFS_SB(inode->i_sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto mkdir_out;
-                return rc;
        }
        if ((pTcon->ses->capabilities & CAP_UNIX) &&
@@ -1362,6 +1436,7 @@ mkdir_get_info:
 mkdir_out:
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -1370,6 +1445,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        int rc = 0;
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct cifsInodeInfo *cifsInode;
@@ -1378,18 +1454,23 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        xid = GetXid();
-        cifs_sb = CIFS_SB(inode->i_sb);
-        pTcon = cifs_sb->tcon;
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto rmdir_exit;
-                return rc;
+        }
+        cifs_sb = CIFS_SB(inode->i_sb);
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                goto rmdir_exit;
        }
+        pTcon = tlink_tcon(tlink);
        rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
                          cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        cifs_put_tlink(tlink);
        if (!rc) {
                drop_nlink(inode);
@@ -1410,6 +1491,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
                current_fs_time(inode->i_sb);
+rmdir_exit:
        kfree(full_path);
        FreeXid(xid);
        return rc;
@@ -1420,10 +1502,16 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
                struct dentry *to_dentry, const char *toPath)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *pTcon;
        __u16 srcfid;
        int oplock, rc;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
        /* try path-based rename first */
        rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
                           cifs_sb->mnt_cifs_flags &
@@ -1435,11 +1523,11 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
         * rename by filehandle to various Windows servers.
         */
        if (rc == 0 || rc != -ETXTBSY)
-                return rc;
+                goto do_rename_exit;
        /* open-file renames don't work across directories */
        if (to_dentry->d_parent != from_dentry->d_parent)
-                return rc;
+                goto do_rename_exit;
        /* open the file to be renamed -- we need DELETE perms */
        rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
@@ -1455,7 +1543,8 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
                CIFSSMBClose(xid, pTcon, srcfid);
        }
+do_rename_exit:
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -1465,13 +1554,17 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
        char *fromName = NULL;
        char *toName = NULL;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *tcon;
        FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
        FILE_UNIX_BASIC_INFO *info_buf_target;
        int xid, rc, tmprc;
        cifs_sb = CIFS_SB(source_dir->i_sb);
-        tcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        tcon = tlink_tcon(tlink);
        xid = GetXid();
@@ -1547,6 +1640,7 @@ cifs_rename_exit:
        kfree(fromName);
        kfree(toName);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -1588,8 +1682,7 @@ cifs_invalidate_mapping(struct inode *inode)
        /* write back any cached data */
        if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
                rc = filemap_write_and_wait(inode->i_mapping);
-                if (rc)
+                mapping_set_error(inode->i_mapping, rc);
-                        cifs_i->write_behind_rc = rc;
        }
        invalidate_remote_inode(inode);
        cifs_fscache_reset_inode_cookie(inode);
@@ -1599,11 +1692,12 @@ int cifs_revalidate_file(struct file *filp)
 {
        int rc = 0;
        struct inode *inode = filp->f_path.dentry->d_inode;
+        struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
        if (!cifs_inode_needs_reval(inode))
                goto check_inval;
-        if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+        if (tlink_tcon(cfile->tlink)->unix_ext)
                rc = cifs_get_file_info_unix(filp);
        else
                rc = cifs_get_file_info(filp);
@@ -1644,7 +1738,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
                 "jiffies %ld", full_path, inode, inode->i_count.counter,
                 dentry, dentry->d_time, jiffies);
-        if (CIFS_SB(sb)->tcon->unix_ext)
+        if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
                rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
        else
                rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
@@ -1660,13 +1754,29 @@ check_inval:
 }
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-        struct kstat *stat)
+                 struct kstat *stat)
 {
+        struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+        struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
        int err = cifs_revalidate_dentry(dentry);
        if (!err) {
                generic_fillattr(dentry->d_inode, stat);
                stat->blksize = CIFS_MAX_MSGSIZE;
                stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+                /*
+                 * If on a multiuser mount without unix extensions, and the
+                 * admin hasn't overridden them, set the ownership to the
+                 * fsuid/fsgid of the current process.
+                 */
+                if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+                    !tcon->unix_ext) {
+                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+                                stat->uid = current_fsuid();
+                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+                                stat->gid = current_fsgid();
+                }
        }
        return err;
 }
@@ -1708,7 +1818,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
        struct cifsFileInfo *open_file;
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct tcon_link *tlink = NULL;
+        struct cifsTconInfo *pTcon = NULL;
        /*
         * To avoid spurious oplock breaks from server, in the case of
@@ -1719,10 +1830,11 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
         * writebehind data than the SMB timeout for the SetPathInfo
         * request would allow
         */
-        open_file = find_writable_file(cifsInode);
+        open_file = find_writable_file(cifsInode, true);
        if (open_file) {
                __u16 nfid = open_file->netfid;
                __u32 npid = open_file->pid;
+                pTcon = tlink_tcon(open_file->tlink);
                rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
                                        npid, false);
                cifsFileInfo_put(open_file);
@@ -1737,6 +1849,13 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
                rc = -EINVAL;
        if (rc != 0) {
+                if (pTcon == NULL) {
+                        tlink = cifs_sb_tlink(cifs_sb);
+                        if (IS_ERR(tlink))
+                                return PTR_ERR(tlink);
+                        pTcon = tlink_tcon(tlink);
+                }
                /* Set file size by pathname rather than by handle
                   either because no valid, writeable file handle for
                   it was found or because there was an error setting
@@ -1766,6 +1885,8 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
                                CIFSSMBClose(xid, pTcon, netfid);
                        }
                }
+                if (tlink)
+                        cifs_put_tlink(tlink);
        }
        if (rc == 0) {
@@ -1786,7 +1907,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
        struct inode *inode = direntry->d_inode;
        struct cifsInodeInfo *cifsInode = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *pTcon;
        struct cifs_unix_set_info_args *args = NULL;
        struct cifsFileInfo *open_file;
@@ -1820,10 +1942,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
         * the flush returns error?
         */
        rc = filemap_write_and_wait(inode->i_mapping);
-        if (rc != 0) {
+        mapping_set_error(inode->i_mapping, rc);
-                cifsInode->write_behind_rc = rc;
+        rc = 0;
-                rc = 0;
-        }
        if (attrs->ia_valid & ATTR_SIZE) {
                rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -1873,17 +1993,25 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
                args->ctime = NO_CHANGE_64;
        args->device = 0;
-        open_file = find_writable_file(cifsInode);
+        open_file = find_writable_file(cifsInode, true);
        if (open_file) {
                u16 nfid = open_file->netfid;
                u32 npid = open_file->pid;
+                pTcon = tlink_tcon(open_file->tlink);
                rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
                cifsFileInfo_put(open_file);
        } else {
+                tlink = cifs_sb_tlink(cifs_sb);
+                if (IS_ERR(tlink)) {
+                        rc = PTR_ERR(tlink);
+                        goto out;
+                }
+                pTcon = tlink_tcon(tlink);
                rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
                                    cifs_sb->local_nls,
                                    cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+                cifs_put_tlink(tlink);
        }
        if (rc)
@@ -1956,10 +2084,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
         * the flush returns error?
         */
        rc = filemap_write_and_wait(inode->i_mapping);
-        if (rc != 0) {
+        mapping_set_error(inode->i_mapping, rc);
-                cifsInode->write_behind_rc = rc;
+        rc = 0;
-                rc = 0;
-        }
        if (attrs->ia_valid & ATTR_SIZE) {
                rc = cifs_set_file_size(inode, attrs, xid, full_path);
@@ -2051,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
        setattr_copy(inode, attrs);
        mark_inode_dirty(inode);
-        return 0;
 cifs_setattr_exit:
        kfree(full_path);
@@ -2064,7 +2189,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
        struct inode *inode = direntry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *pTcon = cifs_sb->tcon;
+        struct cifsTconInfo *pTcon = cifs_sb_master_tcon(cifs_sb);
        if (pTcon->unix_ext)
                return cifs_setattr_unix(direntry, attrs);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9d38a71c8e14..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -37,11 +37,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
        int xid;
        struct cifs_sb_info *cifs_sb;
 #ifdef CONFIG_CIFS_POSIX
+        struct cifsFileInfo *pSMBFile = filep->private_data;
+        struct cifsTconInfo *tcon;
        __u64   ExtAttrBits = 0;
        __u64   ExtAttrMask = 0;
        __u64   caps;
-        struct cifsTconInfo *tcon;
-        struct cifsFileInfo *pSMBFile = filep->private_data;
 #endif /* CONFIG_CIFS_POSIX */
        xid = GetXid();
@@ -50,17 +50,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
        cifs_sb = CIFS_SB(inode->i_sb);
-#ifdef CONFIG_CIFS_POSIX
-        tcon = cifs_sb->tcon;
-        if (tcon)
-                caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
-        else {
-                rc = -EIO;
-                FreeXid(xid);
-                return -EIO;
-        }
-#endif /* CONFIG_CIFS_POSIX */
        switch (command) {
                case CIFS_IOC_CHECKUMOUNT:
                        cFYI(1, "User unmount attempted");
@@ -73,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        break;
 #ifdef CONFIG_CIFS_POSIX
                case FS_IOC_GETFLAGS:
+                        if (pSMBFile == NULL)
+                                break;
+                        tcon = tlink_tcon(pSMBFile->tlink);
+                        caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
                        if (CIFS_UNIX_EXTATTR_CAP & caps) {
-                                if (pSMBFile == NULL)
-                                        break;
                                rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
                                        &ExtAttrBits, &ExtAttrMask);
                                if (rc == 0)
@@ -86,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        break;
                case FS_IOC_SETFLAGS:
+                        if (pSMBFile == NULL)
+                                break;
+                        tcon = tlink_tcon(pSMBFile->tlink);
+                        caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
                        if (CIFS_UNIX_EXTATTR_CAP & caps) {
                                if (get_user(ExtAttrBits, (int __user *)arg)) {
                                        rc = -EFAULT;
                                        break;
                                }
-                                if (pSMBFile == NULL)
-                                        break;
                                /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
                                        extAttrBits, &ExtAttrMask);*/
                        }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 473ca8033656..85cdbf831e7b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,296 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "md5.h"
+/*
+ * Layout of a Minshall+French symlink file, as described by the macros
+ * below and produced by CIFSFormatMFSymlink():
+ *
+ *   offset  0: "XSym\n"                      (4 + 1 bytes)
+ *   offset  5: link length as "%04u\n"       (4 + 1 bytes)
+ *   offset 10: MD5 of the link text, 32 lowercase hex digits + "\n"
+ *   offset 43: link text, at most CIFS_MF_SYMLINK_LINK_MAXLEN bytes
+ *
+ * The file always has exactly CIFS_MF_SYMLINK_FILE_SIZE bytes.
+ */
+#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
+#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
+#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
+#define CIFS_MF_SYMLINK_LINK_MAXLEN (1024)
+#define CIFS_MF_SYMLINK_FILE_SIZE \
+        (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
+/* Header magic plus the zero-padded, 4-digit decimal link length. */
+#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
+/* 16 hash bytes printed as 32 hex digits, newline terminated. */
+#define CIFS_MF_SYMLINK_MD5_FORMAT \
+        "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
+/* Expands a 16-byte hash array into the 16 arguments of the MD5 format. */
+#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
+        md5_hash[0],  md5_hash[1],  md5_hash[2],  md5_hash[3], \
+        md5_hash[4],  md5_hash[5],  md5_hash[6],  md5_hash[7], \
+        md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
+        md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
+/*
+ * Validate the contents of a Minshall+French symlink file and extract the
+ * link target.
+ *
+ * @buf:       file contents; must be exactly CIFS_MF_SYMLINK_FILE_SIZE bytes
+ * @buf_len:   number of valid bytes in @buf
+ * @_link_len: receives the length of the link target on success
+ * @_link_str: if non-NULL, receives a kstrndup() copy of the link target
+ *
+ * Returns 0 on success, -EINVAL if the buffer does not hold a well-formed
+ * MF symlink, or -ENOMEM if the target string cannot be allocated.
+ */
+static int
+CIFSParseMFSymlink(const u8 *buf,
+                   unsigned int buf_len,
+                   unsigned int *_link_len,
+                   char **_link_str)
+{
+        int rc;
+        unsigned int link_len;
+        const char *md5_str1;
+        const char *link_str;
+        struct MD5Context md5_ctx;
+        u8 md5_hash[16];
+        char md5_str2[34];
+        if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+                return -EINVAL;
+        md5_str1 = (const char *)&buf[CIFS_MF_SYMLINK_MD5_OFFSET];
+        link_str = (const char *)&buf[CIFS_MF_SYMLINK_LINK_OFFSET];
+        rc = sscanf(buf, CIFS_MF_SYMLINK_LEN_FORMAT, &link_len);
+        if (rc != 1)
+                return -EINVAL;
+        /*
+         * The length comes straight from the file contents.  Reject values
+         * larger than the link area so that the hash and the copy below
+         * cannot read past the end of buf.
+         */
+        if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+                return -EINVAL;
+        cifs_MD5_init(&md5_ctx);
+        cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
+        cifs_MD5_final(md5_hash, &md5_ctx);
+        snprintf(md5_str2, sizeof(md5_str2),
+                 CIFS_MF_SYMLINK_MD5_FORMAT,
+                 CIFS_MF_SYMLINK_MD5_ARGS(md5_hash));
+        /*
+         * NOTE(review): only the first 17 of the 33 formatted characters
+         * are compared here -- confirm whether that is intentional.
+         */
+        if (strncmp(md5_str1, md5_str2, 17) != 0)
+                return -EINVAL;
+        if (_link_str) {
+                *_link_str = kstrndup(link_str, link_len, GFP_KERNEL);
+                if (!*_link_str)
+                        return -ENOMEM;
+        }
+        *_link_len = link_len;
+        return 0;
+}
+/*
+ * Fill @buf with the on-disk image of a Minshall+French symlink that points
+ * at @link_str: header, length, MD5 of the target, the target itself, a
+ * newline (when there is room) and blank padding up to the fixed file size.
+ *
+ * Returns 0 on success, -EINVAL when @buf_len is not
+ * CIFS_MF_SYMLINK_FILE_SIZE, or -ENAMETOOLONG when the target does not fit.
+ */
+static int
+CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
+{
+        unsigned int target_len;
+        unsigned int pos;
+        struct MD5Context ctx;
+        u8 digest[16];
+        if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
+                return -EINVAL;
+        target_len = strlen(link_str);
+        if (target_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+                return -ENAMETOOLONG;
+        /* hash the target text; the digest goes into the header */
+        cifs_MD5_init(&ctx);
+        cifs_MD5_update(&ctx, (const u8 *)link_str, target_len);
+        cifs_MD5_final(digest, &ctx);
+        /* header: magic, length and hex digest in a single pass */
+        snprintf(buf, buf_len,
+                 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
+                 target_len,
+                 CIFS_MF_SYMLINK_MD5_ARGS(digest));
+        /* the target text follows the header */
+        memcpy(buf + CIFS_MF_SYMLINK_LINK_OFFSET, link_str, target_len);
+        pos = CIFS_MF_SYMLINK_LINK_OFFSET + target_len;
+        /* terminate the target with a newline if a byte is left ... */
+        if (pos < CIFS_MF_SYMLINK_FILE_SIZE)
+                buf[pos++] = '\n';
+        /* ... and blank-fill whatever remains of the fixed-size file */
+        for (; pos < CIFS_MF_SYMLINK_FILE_SIZE; pos++)
+                buf[pos] = ' ';
+        return 0;
+}
+/*
+ * Create the file @fromName on the server and write a Minshall+French
+ * symlink image pointing at @toName into it.
+ *
+ * Returns 0 on success, -ENOMEM if the staging buffer cannot be allocated,
+ * -EIO if the server accepted fewer bytes than the full image, or the error
+ * reported by formatting, open or write.
+ */
+static int
+CIFSCreateMFSymLink(const int xid, struct cifsTconInfo *tcon,
+                    const char *fromName, const char *toName,
+                    const struct nls_table *nls_codepage, int remap)
+{
+        int rc;
+        int oplock = 0;
+        __u16 netfid = 0;
+        unsigned int bytes_written = 0;
+        u8 *image = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+        if (image == NULL)
+                return -ENOMEM;
+        rc = CIFSFormatMFSymlink(image, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+        if (rc)
+                goto free_image;
+        /* FILE_CREATE: the open is expected to create a new file */
+        rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
+                         CREATE_NOT_DIR, &netfid, &oplock, NULL,
+                         nls_codepage, remap);
+        if (rc)
+                goto free_image;
+        rc = CIFSSMBWrite(xid, tcon, netfid,
+                          CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+                          0 /* offset */,
+                          &bytes_written, image, NULL, 0);
+        /* the handle is closed whether or not the write succeeded */
+        CIFSSMBClose(xid, tcon, netfid);
+        /* a short write leaves an incomplete symlink image behind */
+        if (rc == 0 && bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
+                rc = -EIO;
+free_image:
+        kfree(image);
+        return rc;
+}
+/*
+ * Read the file @searchName from the server and, if it holds a valid
+ * Minshall+French symlink, hand back its target.
+ *
+ * @symlinkinfo receives a string allocated by CIFSParseMFSymlink() with
+ * kstrndup(); the caller owns it.
+ *
+ * Returns 0 on success, -EINVAL when the file is not an MF symlink,
+ * -ENOMEM on allocation failure, or the error from open/read.
+ */
+static int
+CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
+                   const unsigned char *searchName, char **symlinkinfo,
+                   const struct nls_table *nls_codepage, int remap)
+{
+        int rc;
+        int oplock = 0;
+        __u16 netfid = 0;
+        u8 *buf;
+        char *pbuf;
+        unsigned int bytes_read = 0;
+        int buf_type = CIFS_NO_BUFFER;
+        unsigned int link_len = 0;
+        FILE_ALL_INFO file_info;
+        rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
+                         CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+                         nls_codepage, remap);
+        if (rc != 0)
+                return rc;
+        /*
+         * NOTE(review): FILE_ALL_INFO.EndOfFile is presumably an on-the-wire
+         * little-endian field (confirm in cifspdu.h), so the constant is
+         * converted before comparing to keep big-endian hosts correct.
+         */
+        if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+                CIFSSMBClose(xid, tcon, netfid);
+                /* it's not a symlink */
+                return -EINVAL;
+        }
+        buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+        if (!buf) {
+                /* do not leak the handle opened above */
+                CIFSSMBClose(xid, tcon, netfid);
+                return -ENOMEM;
+        }
+        pbuf = buf;
+        rc = CIFSSMBRead(xid, tcon, netfid,
+                         CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+                         0 /* offset */,
+                         &bytes_read, &pbuf, &buf_type);
+        CIFSSMBClose(xid, tcon, netfid);
+        if (rc != 0) {
+                kfree(buf);
+                return rc;
+        }
+        rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
+        kfree(buf);
+        return rc;
+}
+/*
+ * Cheap pre-check, based only on already known attributes, for whether a
+ * file might be a Minshall+French symlink: it must be a regular file whose
+ * size is exactly CIFS_MF_SYMLINK_FILE_SIZE.  A true result does not prove
+ * the file is a symlink; the contents still have to be parsed.
+ */
+bool
+CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
+{
+        /*
+         * S_ISREG() compares the whole file-type field.  A plain
+         * "mode & S_IFREG" bit test would also accept S_IFLNK and S_IFSOCK,
+         * whose type values contain the S_IFREG bit.
+         */
+        if (!S_ISREG(fattr->cf_mode))
+                /* it's not a symlink */
+                return false;
+        if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+                /* it's not a symlink */
+                return false;
+        return true;
+}
+/*
+ * If @fattr describes a file that could be a Minshall+French symlink, read
+ * the file at @path and, when its contents parse as one, rewrite @fattr so
+ * that the entry is reported as a symlink (size = target length, mode =
+ * S_IFLNK with all permission bits, dtype = DT_LNK).
+ *
+ * Returns 0 both when the file is a symlink and when it is not; a non-zero
+ * value is returned only for real failures (tcon lookup, open, read,
+ * allocation, or a parse error other than -EINVAL).
+ */
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+                   const unsigned char *path,
+                   struct cifs_sb_info *cifs_sb, int xid)
+{
+        int rc;
+        int oplock = 0;
+        __u16 netfid = 0;
+        struct tcon_link *tlink;
+        struct cifsTconInfo *pTcon;
+        u8 *buf;
+        char *pbuf;
+        unsigned int bytes_read = 0;
+        int buf_type = CIFS_NO_BUFFER;
+        unsigned int link_len = 0;
+        FILE_ALL_INFO file_info;
+        if (!CIFSCouldBeMFSymlink(fattr))
+                /* it's not a symlink */
+                return 0;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+                         CREATE_NOT_DIR, &netfid, &oplock, &file_info,
+                         cifs_sb->local_nls,
+                         cifs_sb->mnt_cifs_flags &
+                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc != 0)
+                goto out;
+        /*
+         * NOTE(review): FILE_ALL_INFO.EndOfFile is presumably an on-the-wire
+         * little-endian field (confirm in cifspdu.h), so the constant is
+         * converted before comparing to keep big-endian hosts correct.
+         */
+        if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+                CIFSSMBClose(xid, pTcon, netfid);
+                /* it's not a symlink */
+                goto out;
+        }
+        buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+        if (!buf) {
+                /* do not leak the handle opened above */
+                CIFSSMBClose(xid, pTcon, netfid);
+                rc = -ENOMEM;
+                goto out;
+        }
+        pbuf = buf;
+        rc = CIFSSMBRead(xid, pTcon, netfid,
+                         CIFS_MF_SYMLINK_FILE_SIZE /* length */,
+                         0 /* offset */,
+                         &bytes_read, &pbuf, &buf_type);
+        CIFSSMBClose(xid, pTcon, netfid);
+        if (rc != 0) {
+                kfree(buf);
+                goto out;
+        }
+        rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
+        kfree(buf);
+        if (rc == -EINVAL) {
+                /* it's not a symlink */
+                rc = 0;
+                goto out;
+        }
+        if (rc != 0)
+                goto out;
+        /* it is a symlink */
+        fattr->cf_eof = link_len;
+        fattr->cf_mode &= ~S_IFMT;
+        fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
+        fattr->cf_dtype = DT_LNK;
+out:
+        cifs_put_tlink(tlink);
+        return rc;
+}
 int
 cifs_hardlink(struct dentry *old_file, struct inode *inode,
@@ -37,17 +327,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
        int xid;
        char *fromName = NULL;
        char *toName = NULL;
-        struct cifs_sb_info *cifs_sb_target;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct cifsInodeInfo *cifsInode;
-        xid = GetXid();
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
-        cifs_sb_target = CIFS_SB(inode->i_sb);
+                return PTR_ERR(tlink);
-        pTcon = cifs_sb_target->tcon;
+        pTcon = tlink_tcon(tlink);
-/* No need to check for cross device links since server will do that
+        xid = GetXid();
-   BB note DFS case in future though (when we may have to check) */
        fromName = build_path_from_dentry(old_file);
        toName = build_path_from_dentry(direntry);
@@ -56,16 +346,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
                goto cifs_hl_exit;
        }
-/*      if (cifs_sb_target->tcon->ses->capabilities & CAP_UNIX)*/
        if (pTcon->unix_ext)
                rc = CIFSUnixCreateHardLink(xid, pTcon, fromName, toName,
-                                            cifs_sb_target->local_nls,
+                                            cifs_sb->local_nls,
-                                            cifs_sb_target->mnt_cifs_flags &
+                                            cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        else {
                rc = CIFSCreateHardLink(xid, pTcon, fromName, toName,
-                                        cifs_sb_target->local_nls,
+                                        cifs_sb->local_nls,
-                                        cifs_sb_target->mnt_cifs_flags &
+                                        cifs_sb->mnt_cifs_flags &
                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
                if ((rc == -EIO) || (rc == -EINVAL))
                        rc = -EOPNOTSUPP;
@@ -101,6 +390,7 @@ cifs_hl_exit:
        kfree(fromName);
        kfree(toName);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -113,10 +403,19 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
        char *full_path = NULL;
        char *target_path = NULL;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        struct cifsTconInfo *tcon = cifs_sb->tcon;
+        struct tcon_link *tlink = NULL;
+        struct cifsTconInfo *tcon;
        xid = GetXid();
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                tlink = NULL;
+                goto out;
+        }
+        tcon = tlink_tcon(tlink);
        /*
         * For now, we just handle symlinks with unix extensions enabled.
         * Eventually we should handle NTFS reparse points, and MacOS
@@ -130,7 +429,8 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
         * but there doesn't seem to be any harm in allowing the client to
         * read them.
         */
-        if (!(tcon->ses->capabilities & CAP_UNIX)) {
+        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+            && !(tcon->ses->capabilities & CAP_UNIX)) {
                rc = -EACCES;
                goto out;
        }
@@ -141,8 +441,21 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
        cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
-        rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
+        rc = -EACCES;
-                                     cifs_sb->local_nls);
+        /*
+         * First try Minshall+French Symlinks, if configured
+         * and fallback to UNIX Extensions Symlinks.
+         */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+                rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
+                                        cifs_sb->local_nls,
+                                        cifs_sb->mnt_cifs_flags &
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if ((rc != 0) && (tcon->ses->capabilities & CAP_UNIX))
+                rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
+                                             cifs_sb->local_nls);
        kfree(full_path);
 out:
        if (rc != 0) {
@@ -151,6 +464,8 @@ out:
        }
        FreeXid(xid);
+        if (tlink)
+                cifs_put_tlink(tlink);
        nd_set_link(nd, target_path);
        return NULL;
 }
@@ -160,29 +475,37 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 {
        int rc = -EOPNOTSUPP;
        int xid;
-        struct cifs_sb_info *cifs_sb;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        char *full_path = NULL;
        struct inode *newinode = NULL;
        xid = GetXid();
-        cifs_sb = CIFS_SB(inode->i_sb);
+        tlink = cifs_sb_tlink(cifs_sb);
-        pTcon = cifs_sb->tcon;
+        if (IS_ERR(tlink)) {
+                rc = PTR_ERR(tlink);
+                goto symlink_exit;
+        }
+        pTcon = tlink_tcon(tlink);
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto symlink_exit;
-                return rc;
        }
        cFYI(1, "Full path: %s", full_path);
        cFYI(1, "symname is %s", symname);
        /* BB what if DFS and this volume is on different share? BB */
-        if (pTcon->unix_ext)
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+                rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
+                                         cifs_sb->local_nls,
+                                         cifs_sb->mnt_cifs_flags &
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
+        else if (pTcon->unix_ext)
                rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
                                           cifs_sb->local_nls);
        /* else
@@ -208,8 +531,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                        d_instantiate(direntry, newinode);
                }
        }
+symlink_exit:
        kfree(full_path);
+        cifs_put_tlink(tlink);
        FreeXid(xid);
        return rc;
 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3ccadc1326d6..43f10281bc19 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -347,7 +347,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                                if (current_fsuid() != treeCon->ses->linux_uid) {
                                        cFYI(1, "Multiuser mode and UID "
                                                 "did not match tcon uid");
-                                        read_lock(&cifs_tcp_ses_lock);
+                                        spin_lock(&cifs_tcp_ses_lock);
                                        list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
                                                ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
                                                if (ses->linux_uid == current_fsuid()) {
@@ -361,7 +361,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                                                        }
                                                }
                                        }
-                                        read_unlock(&cifs_tcp_ses_lock);
+                                        spin_unlock(&cifs_tcp_ses_lock);
                                }
                        }
                }
@@ -551,7 +551,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                return false;
        /* look up tcon based on tid & uid */
-        read_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        list_for_each(tmp, &srv->smb_ses_list) {
                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
                list_for_each(tmp1, &ses->tcon_list) {
@@ -560,51 +560,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                                continue;
                        cifs_stats_inc(&tcon->num_oplock_brks);
-                        read_lock(&GlobalSMBSeslock);
+                        spin_lock(&cifs_file_list_lock);
                        list_for_each(tmp2, &tcon->openFileList) {
                                netfile = list_entry(tmp2, struct cifsFileInfo,
                                                     tlist);
                                if (pSMB->Fid != netfile->netfid)
                                        continue;
-                                /*
-                                 * don't do anything if file is about to be
-                                 * closed anyway.
-                                 */
-                                if (netfile->closePend) {
-                                        read_unlock(&GlobalSMBSeslock);
-                                        read_unlock(&cifs_tcp_ses_lock);
-                                        return true;
-                                }
                                cFYI(1, "file id match, oplock break");
-                                pCifsInode = CIFS_I(netfile->pInode);
+                                pCifsInode = CIFS_I(netfile->dentry->d_inode);
-                                pCifsInode->clientCanCacheAll = false;
-                                if (pSMB->OplockLevel == 0)
-                                        pCifsInode->clientCanCacheRead = false;
+                                cifs_set_oplock_level(pCifsInode,
+                                                      pSMB->OplockLevel);
                                /*
                                 * cifs_oplock_break_put() can't be called
                                 * from here.  Get reference after queueing
                                 * succeeded.  cifs_oplock_break() will
-                                 * synchronize using GlobalSMSSeslock.
+                                 * synchronize using cifs_file_list_lock.
                                 */
                                if (queue_work(system_nrt_wq,
                                               &netfile->oplock_break))
                                        cifs_oplock_break_get(netfile);
                                netfile->oplock_break_cancelled = false;
-                                read_unlock(&GlobalSMBSeslock);
+                                spin_unlock(&cifs_file_list_lock);
-                                read_unlock(&cifs_tcp_ses_lock);
+                                spin_unlock(&cifs_tcp_ses_lock);
                                return true;
                        }
-                        read_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
-                        read_unlock(&cifs_tcp_ses_lock);
+                        spin_unlock(&cifs_tcp_ses_lock);
                        cFYI(1, "No matching file for oplock break");
                        return true;
                }
        }
-        read_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        cFYI(1, "Can not process oplock break for non-existent connection");
        return true;
 }
@@ -729,6 +718,26 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
                           "properly. Hardlinks will not be recognized on this "
                           "mount. Consider mounting with the \"noserverino\" "
                           "option to silence this message.",
-                           cifs_sb->tcon->treeName);
+                           cifs_sb_master_tcon(cifs_sb)->treeName);
+        }
+}
+/*
+ * Record on @cinode what the client may cache for the given oplock value.
+ * Only the low four bits of @oplock are examined: OPLOCK_EXCLUSIVE allows
+ * caching everything, OPLOCK_READ allows read caching only, and any other
+ * value disables both.
+ */
+void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
+{
+        const __u32 level = oplock & 0xF;
+        const bool exclusive = (level == OPLOCK_EXCLUSIVE);
+        const bool cache_read = exclusive || (level == OPLOCK_READ);
+        cinode->clientCanCacheAll = exclusive;
+        cinode->clientCanCacheRead = cache_read;
+        if (exclusive) {
+                cFYI(1, "Exclusive Oplock granted on inode %p",
+                     &cinode->vfs_inode);
+        } else if (cache_read) {
+                cFYI(1, "Level II Oplock granted on inode %p",
+                     &cinode->vfs_inode);
         }
 }
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..5d52e4a3b1ed 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,21 @@
 #define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
 #define NTLMSSP_NEGOTIATE_56        0x80000000
+/*
+ * AV pair field IDs.  The values are spelled out explicitly; they are the
+ * same consecutive numbers, starting at zero, that implicit enumeration
+ * would assign.
+ */
+enum av_field_type {
+        NTLMSSP_AV_EOL                  = 0,
+        NTLMSSP_AV_NB_COMPUTER_NAME     = 1,
+        NTLMSSP_AV_NB_DOMAIN_NAME       = 2,
+        NTLMSSP_AV_DNS_COMPUTER_NAME    = 3,
+        NTLMSSP_AV_DNS_DOMAIN_NAME      = 4,
+        NTLMSSP_AV_DNS_TREE_NAME        = 5,
+        NTLMSSP_AV_FLAGS                = 6,
+        NTLMSSP_AV_TIMESTAMP            = 7,
+        NTLMSSP_AV_RESTRICTION          = 8,
+        NTLMSSP_AV_TARGET_NAME          = 9,
+        NTLMSSP_AV_CHANNEL_BINDINGS     = 10
+};
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful      */
 /* to more closely match the standards document for NTLMSSP from     */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index d5e591fab475..ef7bb7b50f58 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,7 +102,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
                return NULL;
        }
-        if (CIFS_SB(sb)->tcon->nocase)
+        if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
                dentry->d_op = &cifs_ci_dentry_ops;
        else
                dentry->d_op = &cifs_dentry_ops;
@@ -171,7 +171,7 @@ static void
 cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
                       struct cifs_sb_info *cifs_sb)
 {
-        int offset = cifs_sb->tcon->ses->server->timeAdj;
+        int offset = cifs_sb_master_tcon(cifs_sb)->ses->server->timeAdj;
        memset(fattr, 0, sizeof(*fattr));
        fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate,
@@ -199,7 +199,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
        int len;
        int oplock = 0;
        int rc;
-        struct cifsTconInfo *ptcon = cifs_sb->tcon;
+        struct cifsTconInfo *ptcon = cifs_sb_tcon(cifs_sb);
        char *tmpbuffer;
        rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
@@ -223,34 +223,35 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 static int initiate_cifs_search(const int xid, struct file *file)
 {
        int rc = 0;
-        char *full_path;
+        char *full_path = NULL;
        struct cifsFileInfo *cifsFile;
-        struct cifs_sb_info *cifs_sb;
+        struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
-        if (file->private_data == NULL) {
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        if (file->private_data == NULL)
                file->private_data =
                        kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+        if (file->private_data == NULL) {
+                rc = -ENOMEM;
+                goto error_exit;
        }
-        if (file->private_data == NULL)
-                return -ENOMEM;
        cifsFile = file->private_data;
        cifsFile->invalidHandle = true;
        cifsFile->srch_inf.endOfSearch = false;
+        cifsFile->tlink = cifs_get_tlink(tlink);
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        if (cifs_sb == NULL)
-                return -EINVAL;
-        pTcon = cifs_sb->tcon;
-        if (pTcon == NULL)
-                return -EINVAL;
        full_path = build_path_from_dentry(file->f_path.dentry);
+        if (full_path == NULL) {
-        if (full_path == NULL)
+                rc = -ENOMEM;
-                return -ENOMEM;
+                goto error_exit;
+        }
        cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
@@ -283,7 +284,9 @@ ffirst_retry:
                cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
                goto ffirst_retry;
        }
+error_exit:
        kfree(full_path);
+        cifs_put_tlink(tlink);
        return rc;
 }
@@ -525,14 +528,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
           (index_to_find < first_entry_in_buffer)) {
                /* close and restart search */
                cFYI(1, "search backing up - close and restart search");
-                write_lock(&GlobalSMBSeslock);
+                spin_lock(&cifs_file_list_lock);
                if (!cifsFile->srch_inf.endOfSearch &&
                    !cifsFile->invalidHandle) {
                        cifsFile->invalidHandle = true;
-                        write_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
                        CIFSFindClose(xid, pTcon, cifsFile->netfid);
                } else
-                        write_unlock(&GlobalSMBSeslock);
+                        spin_unlock(&cifs_file_list_lock);
                if (cifsFile->srch_inf.ntwrk_buf_start) {
                        cFYI(1, "freeing SMB ff cache buf on search rewind");
                        if (cifsFile->srch_inf.smallBuf)
@@ -738,6 +741,15 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
                cifs_autodisable_serverino(cifs_sb);
        }
+        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
+            CIFSCouldBeMFSymlink(&fattr))
+                /*
+                 * trying to get the type and mode can be slow,
+                 * so just call those regular files for now, and mark
+                 * for reval
+                 */
+                fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
        ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
        tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
@@ -777,9 +789,17 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
-        if (pTcon == NULL)
+        /*
-                return -EINVAL;
+         * Ensure FindFirst doesn't fail before doing filldir() for '.' and
+         * '..'. Otherwise we won't be able to notify VFS in case of failure.
+         */
+        if (file->private_data == NULL) {
+                rc = initiate_cifs_search(xid, file);
+                cFYI(1, "initiate cifs search rc %d", rc);
+                if (rc)
+                        goto rddir2_exit;
+        }
        switch ((int) file->f_pos) {
        case 0:
@@ -805,14 +825,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
                        if after then keep searching till find it */
                if (file->private_data == NULL) {
-                        rc = initiate_cifs_search(xid, file);
-                        cFYI(1, "initiate cifs search rc %d", rc);
-                        if (rc) {
-                                FreeXid(xid);
-                                return rc;
-                        }
-                }
-                if (file->private_data == NULL) {
                        rc = -EINVAL;
                        FreeXid(xid);
                        return rc;
@@ -829,6 +841,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
                        CIFSFindClose(xid, pTcon, cifsFile->netfid);
                } */
+                pTcon = tlink_tcon(cifsFile->tlink);
                rc = find_cifs_entry(xid, pTcon, file,
                                &current_entry, &num_to_fill);
                if (rc) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..7b01d3f6eed6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -32,9 +32,6 @@
 #include <linux/slab.h>
 #include "cifs_spnego.h"
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-                         unsigned char *p24);
 /*
 * Checks if this is the first smb session to be reconnected after
 * the socket has been reestablished (so we know whether to use vc 0).
@@ -80,7 +77,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
        if (max_vcs < 2)
                max_vcs = 0xFFFF;
-        write_lock(&cifs_tcp_ses_lock);
+        spin_lock(&cifs_tcp_ses_lock);
        if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
                        goto get_vc_num_exit;  /* vcnum will be zero */
        for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
@@ -112,7 +109,7 @@ static __le16 get_next_vcnum(struct cifsSesInfo *ses)
                vcnum = i;
        ses->vcnum = vcnum;
 get_vc_num_exit:
-        write_unlock(&cifs_tcp_ses_lock);
+        spin_unlock(&cifs_tcp_ses_lock);
        return cpu_to_le16(vcnum);
 }
@@ -383,6 +380,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
                                    struct cifsSesInfo *ses)
 {
+        unsigned int tioffset; /* challenge message target info area */
+        unsigned int tilen; /* challenge message target info area length  */
        CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
        if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -399,11 +399,23 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
                return -EINVAL;
        }
-        memcpy(ses->server->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
+        memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
        /* BB we could decode pblob->NegotiateFlags; some may be useful */
        /* In particular we can examine sign flags */
        /* BB spec says that if AvId field of MsvAvTimestamp is populated then
                we must set the MIC field of the AUTHENTICATE_MESSAGE */
+        ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
+        tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
+        tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+        if (tilen) {
+                ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+                if (!ses->auth_key.response) {
+                        cERROR(1, "Challenge target info allocation failure");
+                        return -ENOMEM;
+                }
+                memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
+                ses->auth_key.len = tilen;
+        }
        return 0;
 }
@@ -425,12 +437,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
        /* BB is NTLMV2 session security format easier to use here? */
        flags = NTLMSSP_NEGOTIATE_56 |  NTLMSSP_REQUEST_TARGET |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM;
        if (ses->server->secMode &
-           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+                        (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                flags |= NTLMSSP_NEGOTIATE_SIGN;
-        if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
+                if (!ses->server->session_estab)
-                flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+                        flags |= NTLMSSP_NEGOTIATE_KEY_XCH |
+                                NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+        }
        sec_blob->NegotiateFlags |= cpu_to_le32(flags);
@@ -448,13 +462,14 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
   maximum possible size is fixed and small, making this approach cleaner.
   This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
+                                        u16 *buflen,
                                   struct cifsSesInfo *ses,
-                                   const struct nls_table *nls_cp, bool first)
+                                   const struct nls_table *nls_cp)
 {
+        int rc;
        AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
        __u32 flags;
        unsigned char *tmp;
-        char ntlm_session_key[CIFS_SESS_KEY_SIZE];
        memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
        sec_blob->MessageType = NtLmAuthenticate;
@@ -462,7 +477,7 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        flags = NTLMSSP_NEGOTIATE_56 |
                NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
                NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-                NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+                NTLMSSP_NEGOTIATE_NTLM;
        if (ses->server->secMode &
           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                flags |= NTLMSSP_NEGOTIATE_SIGN;
@@ -477,19 +492,20 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->LmChallengeResponse.Length = 0;
        sec_blob->LmChallengeResponse.MaximumLength = 0;
-        /* calculate session key,  BB what about adding similar ntlmv2 path? */
-        SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
-        if (first)
-                cifs_calculate_mac_key(&ses->server->mac_signing_key,
-                                       ntlm_session_key, ses->password);
-        memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
        sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-        sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+        rc = setup_ntlmv2_rsp(ses, nls_cp);
-        sec_blob->NtChallengeResponse.MaximumLength =
+        if (rc) {
-                                cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                cERROR(1, "Error %d during NTLMSSP authentication", rc);
+                goto setup_ntlmv2_ret;
+        }
+        memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                        ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+        tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
-        tmp += CIFS_SESS_KEY_SIZE;
+        sec_blob->NtChallengeResponse.Length =
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+        sec_blob->NtChallengeResponse.MaximumLength =
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
        if (ses->domainName == NULL) {
                sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +517,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
                                    MAX_USERNAME_SIZE, nls_cp);
                len *= 2; /* unicode is 2 bytes each */
-                len += 2; /* trailing null */
                sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->DomainName.Length = cpu_to_le16(len);
                sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +533,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
                len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
                                    MAX_USERNAME_SIZE, nls_cp);
                len *= 2; /* unicode is 2 bytes each */
-                len += 2; /* trailing null */
                sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
                sec_blob->UserName.Length = cpu_to_le16(len);
                sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,10 +544,23 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
        sec_blob->WorkstationName.MaximumLength = 0;
        tmp += 2;
-        sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+        if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
-        sec_blob->SessionKey.Length = 0;
+                        !calc_seckey(ses)) {
-        sec_blob->SessionKey.MaximumLength = 0;
+                memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
-        return tmp - pbuffer;
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+                sec_blob->SessionKey.MaximumLength =
+                                cpu_to_le16(CIFS_CPHTXT_SIZE);
+                tmp += CIFS_CPHTXT_SIZE;
+        } else {
+                sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+                sec_blob->SessionKey.Length = 0;
+                sec_blob->SessionKey.MaximumLength = 0;
+        }
+setup_ntlmv2_ret:
+        *buflen = tmp - pbuffer;
+        return rc;
 }
@@ -545,19 +572,6 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
        return;
 }
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
-                                  struct cifsSesInfo *ses,
-                                  const struct nls_table *nls, bool first_time)
-{
-        int bloblen;
-        bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
-                                          first_time);
-        pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
-        return bloblen;
-}
 #endif
 int
@@ -579,18 +593,23 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
        int bytes_remaining;
        struct key *spnego_key = NULL;
        __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-        bool first_time;
+        u16 blob_len;
+        char *ntlmsspblob = NULL;
        if (ses == NULL)
                return -EINVAL;
-        read_lock(&cifs_tcp_ses_lock);
-        first_time = is_first_ses_reconnect(ses);
-        read_unlock(&cifs_tcp_ses_lock);
        type = ses->server->secType;
        cFYI(1, "sess setup type %d", type);
+        if (type == RawNTLMSSP) {
+                /* if memory allocation is successful, caller of this function
+                 * frees it.
+                 */
+                ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
+                if (!ses->ntlmssp)
+                        return -ENOMEM;
+        }
 ssetup_ntlmssp_authenticate:
        if (phase == NtLmChallenge)
                phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -655,10 +674,14 @@ ssetup_ntlmssp_authenticate:
                /* no capabilities flags in old lanman negotiation */
                pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-                /* BB calculate hash with password */
-                /* and copy into bcc */
-                calc_lanman_hash(ses->password, ses->server->cryptKey,
+                /* Calculate hash with password and copy into bcc_ptr.
+                 * Encryption Key (stored as in cryptkey) gets used if the
+                 * security mode bit in Negotiate Protocol response states
+                 * to use challenge/response method (i.e. Password bit is 1).
+                 */
+                calc_lanman_hash(ses->password, ses->server->cryptkey,
                                 ses->server->secMode & SECMODE_PW_ENCRYPT ?
                                        true : false, lnm_session_key);
@@ -676,28 +699,27 @@ ssetup_ntlmssp_authenticate:
                ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
        } else if (type == NTLM) {
-                char ntlm_session_key[CIFS_SESS_KEY_SIZE];
                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
                pSMB->req_no_secext.CaseInsensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                        cpu_to_le16(CIFS_AUTH_RESP_SIZE);
                pSMB->req_no_secext.CaseSensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                        cpu_to_le16(CIFS_AUTH_RESP_SIZE);
-                /* calculate session key */
+                /* calculate ntlm response and session key */
-                SMBNTencrypt(ses->password, ses->server->cryptKey,
+                rc = setup_ntlm_response(ses);
-                             ntlm_session_key);
+                if (rc) {
+                        cERROR(1, "Error %d during NTLM authentication", rc);
+                        goto ssetup_exit;
+                }
-                if (first_time) /* should this be moved into common code
+                /* copy ntlm response */
-                                  with similar ntlmv2 path? */
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
-                        cifs_calculate_mac_key(&ses->server->mac_signing_key,
+                                CIFS_AUTH_RESP_SIZE);
-                                ntlm_session_key, ses->password);
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
-                /* copy session key */
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                                CIFS_AUTH_RESP_SIZE);
+                bcc_ptr += CIFS_AUTH_RESP_SIZE;
-                memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
-                memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE);
-                bcc_ptr += CIFS_SESS_KEY_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
                        /* unicode strings must be word aligned */
                        if (iov[0].iov_len % 2) {
@@ -708,33 +730,27 @@ ssetup_ntlmssp_authenticate:
                } else
                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
        } else if (type == NTLMv2) {
-                char *v2_sess_key =
-                        kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
-                /* BB FIXME change all users of v2_sess_key to
-                   struct ntlmv2_resp */
-                if (v2_sess_key == NULL) {
-                        rc = -ENOMEM;
-                        goto ssetup_exit;
-                }
                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
                /* LM2 password would be here if we supported it */
                pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
-                /*      cpu_to_le16(LM2_SESS_KEY_SIZE); */
+                /* calculate ntlmv2 response and session key */
+                rc = setup_ntlmv2_rsp(ses, nls_cp);
+                if (rc) {
+                        cERROR(1, "Error %d during NTLMv2 authentication", rc);
+                        goto ssetup_exit;
+                }
+                memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+                                ses->auth_key.len - CIFS_SESS_KEY_SIZE);
+                bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
+                /* set case sensitive password length after tilen may get
+                 * assigned, tilen is 0 otherwise.
+                 */
                pSMB->req_no_secext.CaseSensitivePasswordLength =
-                        cpu_to_le16(sizeof(struct ntlmv2_resp));
+                        cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
-                /* calculate session key */
-                setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-                /* FIXME: calculate MAC key */
-                memcpy(bcc_ptr, (char *)v2_sess_key,
-                       sizeof(struct ntlmv2_resp));
-                bcc_ptr += sizeof(struct ntlmv2_resp);
-                kfree(v2_sess_key);
                if (ses->capabilities & CAP_UNICODE) {
                        if (iov[0].iov_len % 2) {
                                *bcc_ptr = 0;
@@ -746,6 +762,7 @@ ssetup_ntlmssp_authenticate:
        } else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
                struct cifs_spnego_msg *msg;
                spnego_key = cifs_get_spnego_key(ses);
                if (IS_ERR(spnego_key)) {
                        rc = PTR_ERR(spnego_key);
@@ -763,19 +780,17 @@ ssetup_ntlmssp_authenticate:
                        rc = -EKEYREJECTED;
                        goto ssetup_exit;
                }
-                /* bail out if key is too long */
-                if (msg->sesskey_len >
+                ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
-                    sizeof(ses->server->mac_signing_key.data.krb5)) {
+                if (!ses->auth_key.response) {
-                        cERROR(1, "Kerberos signing key too long (%u bytes)",
+                        cERROR(1, "Kerberos can't allocate (%u bytes) memory",
-                                msg->sesskey_len);
+                                        msg->sesskey_len);
-                        rc = -EOVERFLOW;
+                        rc = -ENOMEM;
                        goto ssetup_exit;
                }
-                if (first_time) {
+                memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
-                        ses->server->mac_signing_key.len = msg->sesskey_len;
+                ses->auth_key.len = msg->sesskey_len;
-                        memcpy(ses->server->mac_signing_key.data.krb5,
-                                msg->data, msg->sesskey_len);
-                }
                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
                capabilities |= CAP_EXTENDED_SECURITY;
                pSMB->req.Capabilities = cpu_to_le32(capabilities);
@@ -815,12 +830,30 @@ ssetup_ntlmssp_authenticate:
                        if (phase == NtLmNegotiate) {
                                setup_ntlmssp_neg_req(pSMB, ses);
                                iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+                                iov[1].iov_base = &pSMB->req.SecurityBlob[0];
                        } else if (phase == NtLmAuthenticate) {
-                                int blob_len;
+                                /* 5 is an empirical value, large enough to
-                                blob_len = setup_ntlmssp_auth_req(pSMB, ses,
+                                 * hold authenticate message, max 10 of
-                                                                  nls_cp,
+                                 * av pairs, domain, user, workstation names,
-                                                                  first_time);
+                                 * flags etc..
+                                 */
+                                ntlmsspblob = kmalloc(
+                                        5*sizeof(struct _AUTHENTICATE_MESSAGE),
+                                        GFP_KERNEL);
+                                if (!ntlmsspblob) {
+                                        cERROR(1, "Can't allocate NTLMSSP");
+                                        rc = -ENOMEM;
+                                        goto ssetup_exit;
+                                }
+                                rc = build_ntlmssp_auth_blob(ntlmsspblob,
+                                                        &blob_len, ses, nls_cp);
+                                if (rc)
+                                        goto ssetup_exit;
                                iov[1].iov_len = blob_len;
+                                iov[1].iov_base = ntlmsspblob;
+                                pSMB->req.SecurityBlobLength =
+                                        cpu_to_le16(blob_len);
                                /* Make sure that we tell the server that we
                                   are using the uid that it just gave us back
                                   on the response (challenge) */
@@ -830,7 +863,6 @@ ssetup_ntlmssp_authenticate:
                                rc = -ENOSYS;
                                goto ssetup_exit;
                        }
-                        iov[1].iov_base = &pSMB->req.SecurityBlob[0];
                        /* unicode strings must be word aligned */
                        if ((iov[0].iov_len + iov[1].iov_len) % 2) {
                                *bcc_ptr = 0;
@@ -861,8 +893,6 @@ ssetup_ntlmssp_authenticate:
                          CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
        /* SMB request buf freed in SendReceive2 */
-        cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
        pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
        smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -895,7 +925,6 @@ ssetup_ntlmssp_authenticate:
        bcc_ptr = pByteArea(smb_buf);
        if (smb_buf->WordCount == 4) {
-                __u16 blob_len;
                blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
                if (blob_len > bytes_remaining) {
                        cERROR(1, "bad security blob length %d", blob_len);
@@ -931,6 +960,8 @@ ssetup_exit:
                key_put(spnego_key);
        }
        kfree(str_area);
+        kfree(ntlmsspblob);
+        ntlmsspblob = NULL;
        if (resp_buf_type == CIFS_SMALL_BUFFER) {
                cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
                cifs_small_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(midQ->resp_buf,
-                                                &ses->server->mac_signing_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
                    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                             SECMODE_SIGN_ENABLED))) {
                        rc = cifs_verify_signature(out_buf,
-                                                &ses->server->mac_signing_key,
+                                                ses->server,
                                                midQ->sequence_number+1);
                        if (rc) {
                                cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
            (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
                                     SECMODE_SIGN_ENABLED))) {
                rc = cifs_verify_signature(out_buf,
-                                           &ses->server->mac_signing_key,
+                                           ses->server,
                                           midQ->sequence_number+1);
                if (rc) {
                        cERROR(1, "Unexpected SMB signature");
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a1509207bfa6..a264b744bb41 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -47,9 +47,10 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 #ifdef CONFIG_CIFS_XATTR
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct super_block *sb;
-        char *full_path;
+        char *full_path = NULL;
        if (direntry == NULL)
                return -EIO;
@@ -58,16 +59,19 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
        sb = direntry->d_inode->i_sb;
        if (sb == NULL)
                return -EIO;
-        xid = GetXid();
        cifs_sb = CIFS_SB(sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto remove_ea_exit;
-                return rc;
        }
        if (ea_name == NULL) {
                cFYI(1, "Null xattr names not supported");
@@ -91,6 +95,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 remove_ea_exit:
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
 #endif
        return rc;
 }
@@ -102,6 +107,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct super_block *sb;
        char *full_path;
@@ -113,16 +119,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
        sb = direntry->d_inode->i_sb;
        if (sb == NULL)
                return -EIO;
-        xid = GetXid();
        cifs_sb = CIFS_SB(sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto set_ea_exit;
-                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
@@ -132,9 +141,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
                returns as xattrs */
        if (value_size > MAX_EA_VALUE_SIZE) {
                cFYI(1, "size of EA value too large");
-                kfree(full_path);
+                rc = -EOPNOTSUPP;
-                FreeXid(xid);
+                goto set_ea_exit;
-                return -EOPNOTSUPP;
        }
        if (ea_name == NULL) {
@@ -198,6 +206,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 set_ea_exit:
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
 #endif
        return rc;
 }
@@ -209,6 +218,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 #ifdef CONFIG_CIFS_XATTR
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct super_block *sb;
        char *full_path;
@@ -221,16 +231,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
        if (sb == NULL)
                return -EIO;
-        xid = GetXid();
        cifs_sb = CIFS_SB(sb);
-        pTcon = cifs_sb->tcon;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
+        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto get_ea_exit;
-                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
@@ -323,6 +335,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 get_ea_exit:
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
 #endif
        return rc;
 }
@@ -333,6 +346,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 #ifdef CONFIG_CIFS_XATTR
        int xid;
        struct cifs_sb_info *cifs_sb;
+        struct tcon_link *tlink;
        struct cifsTconInfo *pTcon;
        struct super_block *sb;
        char *full_path;
@@ -346,18 +360,20 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
                return -EIO;
        cifs_sb = CIFS_SB(sb);
-        pTcon = cifs_sb->tcon;
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
                return -EOPNOTSUPP;
+        tlink = cifs_sb_tlink(cifs_sb);
+        if (IS_ERR(tlink))
+                return PTR_ERR(tlink);
+        pTcon = tlink_tcon(tlink);
        xid = GetXid();
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
                rc = -ENOMEM;
-                FreeXid(xid);
+                goto list_ea_exit;
-                return rc;
        }
        /* return dos attributes as pseudo xattr */
        /* return alt name if available as pseudo attr */
@@ -370,8 +386,10 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
+list_ea_exit:
        kfree(full_path);
        FreeXid(xid);
+        cifs_put_tlink(tlink);
 #endif
        return rc;
 }
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index a5bf5771a22a..9060f08e70cf 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -31,19 +32,23 @@ void coda_cache_enter(struct inode *inode, int mask)
 {
        struct coda_inode_info *cii = ITOC(inode);
+        spin_lock(&cii->c_lock);
        cii->c_cached_epoch = atomic_read(&permission_epoch);
        if (cii->c_uid != current_fsuid()) {
                cii->c_uid = current_fsuid();
                cii->c_cached_perm = mask;
        } else
                cii->c_cached_perm |= mask;
+        spin_unlock(&cii->c_lock);
 }
 /* remove cached acl from an inode */
 void coda_cache_clear_inode(struct inode *inode)
 {
        struct coda_inode_info *cii = ITOC(inode);
+        spin_lock(&cii->c_lock);
        cii->c_cached_epoch = atomic_read(&permission_epoch) - 1;
+        spin_unlock(&cii->c_lock);
 }
 /* remove all acl caches */
@@ -57,13 +62,15 @@ void coda_cache_clear_all(struct super_block *sb)
 int coda_cache_check(struct inode *inode, int mask)
 {
        struct coda_inode_info *cii = ITOC(inode);
-        int hit;
+        int hit;
        
-        hit = (mask & cii->c_cached_perm) == mask &&
+        spin_lock(&cii->c_lock);
-                cii->c_uid == current_fsuid() &&
+        hit = (mask & cii->c_cached_perm) == mask &&
-                cii->c_cached_epoch == atomic_read(&permission_epoch);
+            cii->c_uid == current_fsuid() &&
+            cii->c_cached_epoch == atomic_read(&permission_epoch);
+        spin_unlock(&cii->c_lock);
-        return hit;
+        return hit;
 }
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index a7a780929eec..602240569c89 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -45,13 +45,15 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
 static int coda_test_inode(struct inode *inode, void *data)
 {
        struct CodaFid *fid = (struct CodaFid *)data;
-        return coda_fideq(&(ITOC(inode)->c_fid), fid);
+        struct coda_inode_info *cii = ITOC(inode);
+        return coda_fideq(&cii->c_fid, fid);
 }
 static int coda_set_inode(struct inode *inode, void *data)
 {
        struct CodaFid *fid = (struct CodaFid *)data;
-        ITOC(inode)->c_fid = *fid;
+        struct coda_inode_info *cii = ITOC(inode);
+        cii->c_fid = *fid;
        return 0;
 }
@@ -71,6 +73,7 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
                cii = ITOC(inode);
                /* we still need to set i_ino for things like stat(2) */
                inode->i_ino = hash;
+                /* inode is locked and unique, no need to grab cii->c_lock */
                cii->c_mapcount = 0;
                unlock_new_inode(inode);
        }
@@ -107,14 +110,20 @@ int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_bloc
 }
+/* Although we treat Coda file identifiers as immutable, there is one
+ * special case for files created during a disconnection where they may
+ * not be globally unique. When an identifier collision is detected we
+ * first try to flush the cached inode from the kernel and finally
+ * resort to renaming/rehashing in-place. Userspace remembers both old
+ * and new values of the identifier to handle any in-flight upcalls.
+ * The real solution is to use globally unique UUIDs as identifiers, but
+ * retrofitting the existing userspace code for this is non-trivial. */
 void coda_replace_fid(struct inode *inode, struct CodaFid *oldfid, 
                      struct CodaFid *newfid)
 {
-        struct coda_inode_info *cii;
+        struct coda_inode_info *cii = ITOC(inode);
        unsigned long hash = coda_f2i(newfid);
        
-        cii = ITOC(inode);
        BUG_ON(!coda_fideq(&cii->c_fid, oldfid));
        /* replace fid and rehash inode */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ccd98b0f2b0b..5d8b35539601 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -17,7 +17,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <asm/uaccess.h>
@@ -116,15 +116,11 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
                goto exit;
        }
-        lock_kernel();
        error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
                             &type, &resfid);
        if (!error)
                error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-        unlock_kernel();
        if (error && error != -ENOENT)
                return ERR_PTR(error);
@@ -140,28 +136,24 @@ exit:
 int coda_permission(struct inode *inode, int mask)
 {
-        int error = 0;
+        int error;
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
        if (!mask)
-                return 0; 
+                return 0;
        if ((mask & MAY_EXEC) && !execute_ok(inode))
                return -EACCES;
-        lock_kernel();
        if (coda_cache_check(inode, mask))
-                goto out; 
+                return 0;
-        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
+        error = venus_access(inode->i_sb, coda_i2f(inode), mask);
    
        if (!error)
                coda_cache_enter(inode, mask);
- out:
-        unlock_kernel();
        return error;
 }
@@ -200,41 +192,34 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 /* creation routines: create, mknod, mkdir, link, symlink */
 static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd)
 {
-        int error=0;
+        int error;
        const char *name=de->d_name.name;
        int length=de->d_name.len;
        struct inode *inode;
        struct CodaFid newfid;
        struct coda_vattr attrs;
-        lock_kernel();
+        if (coda_isroot(dir) && coda_iscontrol(name, length))
-        if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-                unlock_kernel();
                return -EPERM;
-        }
        error = venus_create(dir->i_sb, coda_i2f(dir), name, length, 
                                0, mode, &newfid, &attrs);
+        if (error)
-        if ( error ) {
+                goto err_out;
-                unlock_kernel();
-                d_drop(de);
-                return error;
-        }
        inode = coda_iget(dir->i_sb, &newfid, &attrs);
-        if ( IS_ERR(inode) ) {
+        if (IS_ERR(inode)) {
-                unlock_kernel();
+                error = PTR_ERR(inode);
-                d_drop(de);
+                goto err_out;
-                return PTR_ERR(inode);
        }
        /* invalidate the directory cnode's attributes */
        coda_dir_update_mtime(dir);
-        unlock_kernel();
        d_instantiate(de, inode);
        return 0;
+err_out:
+        d_drop(de);
+        return error;
 }
 static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
@@ -246,36 +231,29 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
        int error;
        struct CodaFid newfid;
-        lock_kernel();
+        if (coda_isroot(dir) && coda_iscontrol(name, len))
-        if (coda_isroot(dir) && coda_iscontrol(name, len)) {
-                unlock_kernel();
                return -EPERM;
-        }
        attrs.va_mode = mode;
        error = venus_mkdir(dir->i_sb, coda_i2f(dir), 
                               name, len, &newfid, &attrs);
-        
+        if (error)
-        if ( error ) {
+                goto err_out;
-                unlock_kernel();
-                d_drop(de);
-                return error;
-        }
         
        inode = coda_iget(dir->i_sb, &newfid, &attrs);
-        if ( IS_ERR(inode) ) {
+        if (IS_ERR(inode)) {
-                unlock_kernel();
+                error = PTR_ERR(inode);
-                d_drop(de);
+                goto err_out;
-                return PTR_ERR(inode);
        }
        /* invalidate the directory cnode's attributes */
        coda_dir_inc_nlink(dir);
        coda_dir_update_mtime(dir);
-        unlock_kernel();
        d_instantiate(de, inode);
        return 0;
+err_out:
+        d_drop(de);
+        return error;
 }
 /* try to make de an entry in dir_inodde linked to source_de */ 
@@ -287,52 +265,38 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
        int len = de->d_name.len;
        int error;
-        lock_kernel();
+        if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
-        if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
-                unlock_kernel();
                return -EPERM;
-        }
        error = venus_link(dir_inode->i_sb, coda_i2f(inode),
                           coda_i2f(dir_inode), (const char *)name, len);
        if (error) {
                d_drop(de);
-                goto out;
+                return error;
        }
        coda_dir_update_mtime(dir_inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(de, inode);
        inc_nlink(inode);
+        return 0;
-out:
-        unlock_kernel();
-        return(error);
 }
 static int coda_symlink(struct inode *dir_inode, struct dentry *de,
                        const char *symname)
 {
-        const char *name = de->d_name.name;
+        const char *name = de->d_name.name;
        int len = de->d_name.len;
        int symlen;
-        int error = 0;
+        int error;
-        lock_kernel();
-        if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
+        if (coda_isroot(dir_inode) && coda_iscontrol(name, len))
-                unlock_kernel();
                return -EPERM;
-        }
        symlen = strlen(symname);
-        if ( symlen > CODA_MAXPATHLEN ) {
+        if (symlen > CODA_MAXPATHLEN)
-                unlock_kernel();
                return -ENAMETOOLONG;
-        }
        /*
         * This entry is now negative. Since we do not create
@@ -343,10 +307,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
                              symname, symlen);
        /* mtime is no good anymore */
-        if ( !error )
+        if (!error)
                coda_dir_update_mtime(dir_inode);
-        unlock_kernel();
        return error;
 }
@@ -357,17 +320,12 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
        const char *name = de->d_name.name;
        int len = de->d_name.len;
-        lock_kernel();
        error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
-        if ( error ) {
+        if (error)
-                unlock_kernel();
                return error;
-        }
        coda_dir_update_mtime(dir);
        drop_nlink(de->d_inode);
-        unlock_kernel();
        return 0;
 }
@@ -377,8 +335,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
        int len = de->d_name.len;
        int error;
-        lock_kernel();
        error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
        if (!error) {
                /* VFS may delete the child */
@@ -389,7 +345,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
                coda_dir_drop_nlink(dir);
                coda_dir_update_mtime(dir);
        }
-        unlock_kernel();
        return error;
 }
@@ -403,15 +358,12 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
        int new_length = new_dentry->d_name.len;
        int error;
-        lock_kernel();
        error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
                             coda_i2f(new_dir), old_length, new_length,
                             (const char *) old_name, (const char *)new_name);
+        if (!error) {
-        if ( !error ) {
+                if (new_dentry->d_inode) {
-                if ( new_dentry->d_inode ) {
+                        if (S_ISDIR(new_dentry->d_inode->i_mode)) {
-                        if ( S_ISDIR(new_dentry->d_inode->i_mode) ) {
                                coda_dir_drop_nlink(old_dir);
                                coda_dir_inc_nlink(new_dir);
                        }
@@ -423,8 +375,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
                        coda_flag_inode(new_dir, C_VATTR);
                }
        }
-        unlock_kernel();
        return error;
 }
@@ -594,10 +544,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
        struct inode *inode = de->d_inode;
        struct coda_inode_info *cii;
-        if (!inode)
+        if (!inode || coda_isroot(inode))
-                return 1;
-        lock_kernel();
-        if (coda_isroot(inode))
                goto out;
        if (is_bad_inode(inode))
                goto bad;
@@ -617,13 +564,12 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
                goto out;
        /* clear the flags. */
+        spin_lock(&cii->c_lock);
        cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+        spin_unlock(&cii->c_lock);
 bad:
-        unlock_kernel();
        return 0;
 out:
-        unlock_kernel();
        return 1;
 }
@@ -656,20 +602,19 @@ static int coda_dentry_delete(struct dentry * dentry)
 int coda_revalidate_inode(struct dentry *dentry)
 {
        struct coda_vattr attr;
-        int error = 0;
+        int error;
        int old_mode;
        ino_t old_ino;
        struct inode *inode = dentry->d_inode;
        struct coda_inode_info *cii = ITOC(inode);
-        lock_kernel();
+        if (!cii->c_flags)
-        if ( !cii->c_flags )
+                return 0;
-                goto ok;
        if (cii->c_flags & (C_VATTR | C_PURGE | C_FLUSH)) {
                error = venus_getattr(inode->i_sb, &(cii->c_fid), &attr);
-                if ( error )
+                if (error)
-                        goto return_bad;
+                        return -EIO;
                /* this inode may be lost if:
                   - it's ino changed 
@@ -688,17 +633,13 @@ int coda_revalidate_inode(struct dentry *dentry)
                /* the following can happen when a local fid is replaced 
                   with a global one, here we lose and declare the inode bad */
                if (inode->i_ino != old_ino)
-                        goto return_bad;
+                        return -EIO;
                
                coda_flag_inode_children(inode, C_FLUSH);
+                spin_lock(&cii->c_lock);
                cii->c_flags &= ~(C_VATTR | C_PURGE | C_FLUSH);
+                spin_unlock(&cii->c_lock);
        }
-ok:
-        unlock_kernel();
        return 0;
-return_bad:
-        unlock_kernel();
-        return -EIO;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ad3cd2abeeb4..c8b50ba4366a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -15,7 +15,7 @@
 #include <linux/stat.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -109,19 +109,24 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
        coda_inode = coda_file->f_path.dentry->d_inode;
        host_inode = host_file->f_path.dentry->d_inode;
+        cii = ITOC(coda_inode);
+        spin_lock(&cii->c_lock);
        coda_file->f_mapping = host_file->f_mapping;
        if (coda_inode->i_mapping == &coda_inode->i_data)
                coda_inode->i_mapping = host_inode->i_mapping;
        /* only allow additional mmaps as long as userspace isn't changing
         * the container file on us! */
-        else if (coda_inode->i_mapping != host_inode->i_mapping)
+        else if (coda_inode->i_mapping != host_inode->i_mapping) {
+                spin_unlock(&cii->c_lock);
                return -EBUSY;
+        }
        /* keep track of how often the coda_inode/host_file has been mmapped */
-        cii = ITOC(coda_inode);
        cii->c_mapcount++;
        cfi->cfi_mapcount++;
+        spin_unlock(&cii->c_lock);
        return host_file->f_op->mmap(host_file, vma);
 }
@@ -138,8 +143,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        if (!cfi)
                return -ENOMEM;
-        lock_kernel();
        error = venus_open(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
                           &host_file);
        if (!host_file)
@@ -147,7 +150,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        if (error) {
                kfree(cfi);
-                unlock_kernel();
                return error;
        }
@@ -159,8 +161,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        BUG_ON(coda_file->private_data != NULL);
        coda_file->private_data = cfi;
-        unlock_kernel();
        return 0;
 }
@@ -171,9 +171,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
        struct coda_file_info *cfi;
        struct coda_inode_info *cii;
        struct inode *host_inode;
-        int err = 0;
+        int err;
-        lock_kernel();
        cfi = CODA_FTOC(coda_file);
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -185,18 +183,18 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
        cii = ITOC(coda_inode);
        /* did we mmap this file? */
+        spin_lock(&cii->c_lock);
        if (coda_inode->i_mapping == &host_inode->i_data) {
                cii->c_mapcount -= cfi->cfi_mapcount;
                if (!cii->c_mapcount)
                        coda_inode->i_mapping = &coda_inode->i_data;
        }
+        spin_unlock(&cii->c_lock);
        fput(cfi->cfi_container);
        kfree(coda_file->private_data);
        coda_file->private_data = NULL;
-        unlock_kernel();
        /* VFS fput ignores the return value from file_operations->release, so
         * there is no use returning an error here */
        return 0;
@@ -207,7 +205,7 @@ int coda_fsync(struct file *coda_file, int datasync)
        struct file *host_file;
        struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
        struct coda_file_info *cfi;
-        int err = 0;
+        int err;
        if (!(S_ISREG(coda_inode->i_mode) || S_ISDIR(coda_inode->i_mode) ||
              S_ISLNK(coda_inode->i_mode)))
@@ -218,11 +216,8 @@ int coda_fsync(struct file *coda_file, int datasync)
        host_file = cfi->cfi_container;
        err = vfs_fsync(host_file, datasync);
-        if ( !err && !datasync ) {
+        if (!err && !datasync)
-                lock_kernel();
                err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-                unlock_kernel();
-        }
        return err;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6526e6f21ecf..5ea57c8c7f97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -15,7 +15,8 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
@@ -51,6 +52,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
        ei->c_flags = 0;
        ei->c_uid = 0;
        ei->c_cached_perm = 0;
+        spin_lock_init(&ei->c_lock);
        return &ei->vfs_inode;
 }
@@ -143,7 +145,7 @@ static int get_device_index(struct coda_mount_data *data)
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct inode *root = NULL;
-        struct venus_comm *vc = NULL;
+        struct venus_comm *vc;
        struct CodaFid fid;
        int error;
        int idx;
@@ -157,21 +159,26 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
        vc = &coda_comms[idx];
+        mutex_lock(&vc->vc_mutex);
        if (!vc->vc_inuse) {
                printk("coda_read_super: No pseudo device\n");
-                return -EINVAL;
+                error = -EINVAL;
+                goto unlock_out;
        }
-        if ( vc->vc_sb ) {
+        if (vc->vc_sb) {
                printk("coda_read_super: Device already mounted\n");
-                return -EBUSY;
+                error = -EBUSY;
+                goto unlock_out;
        }
        error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
        if (error)
-                goto bdi_err;
+                goto unlock_out;
        vc->vc_sb = sb;
+        mutex_unlock(&vc->vc_mutex);
        sb->s_fs_info = vc;
        sb->s_flags |= MS_NOATIME;
@@ -200,26 +207,33 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        printk("coda_read_super: rootinode is %ld dev %s\n", 
               root->i_ino, root->i_sb->s_id);
        sb->s_root = d_alloc_root(root);
-        if (!sb->s_root)
+        if (!sb->s_root) {
+                error = -EINVAL;
                goto error;
-        return 0;
+        }
+        return 0;
- error:
+error:
-        bdi_destroy(&vc->bdi);
- bdi_err:
        if (root)
                iput(root);
-        if (vc)
-                vc->vc_sb = NULL;
-        return -EINVAL;
+        mutex_lock(&vc->vc_mutex);
+        bdi_destroy(&vc->bdi);
+        vc->vc_sb = NULL;
+        sb->s_fs_info = NULL;
+unlock_out:
+        mutex_unlock(&vc->vc_mutex);
+        return error;
 }
 static void coda_put_super(struct super_block *sb)
 {
-        bdi_destroy(&coda_vcp(sb)->bdi);
+        struct venus_comm *vcp = coda_vcp(sb);
-        coda_vcp(sb)->vc_sb = NULL;
+        mutex_lock(&vcp->vc_mutex);
+        bdi_destroy(&vcp->bdi);
+        vcp->vc_sb = NULL;
        sb->s_fs_info = NULL;
+        mutex_unlock(&vcp->vc_mutex);
        printk("Coda: Bye bye.\n");
 }
@@ -245,8 +259,6 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
        struct coda_vattr vattr;
        int error;
-        lock_kernel();
-        
        memset(&vattr, 0, sizeof(vattr)); 
        inode->i_ctime = CURRENT_TIME_SEC;
@@ -256,13 +268,10 @@ int coda_setattr(struct dentry *de, struct iattr *iattr)
        /* Venus is responsible for truncating the container-file!!! */
        error = venus_setattr(inode->i_sb, coda_i2f(inode), &vattr);
-        if ( !error ) {
+        if (!error) {
                coda_vattr_to_iattr(inode, &vattr); 
                coda_cache_clear_inode(inode);
        }
-        unlock_kernel();
        return error;
 }
@@ -276,12 +285,8 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        int error;
        
-        lock_kernel();
        error = venus_statfs(dentry, buf);
-        unlock_kernel();
        if (error) {
                /* fake something like AFS does */
                buf->f_blocks = 9000000;
@@ -301,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 /* init_coda: used by filesystems.c to register coda */
-static int coda_get_sb(struct file_system_type *fs_type,
+static struct dentry *coda_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, coda_fill_super);
 }
 struct file_system_type coda_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "coda",
-        .get_sb         = coda_get_sb,
+        .mount          = coda_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ca25d96d45c9..2fd89b5c5c7b 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -23,8 +23,6 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
-#include <linux/smp_lock.h>
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
@@ -39,6 +37,7 @@ const struct inode_operations coda_ioctl_inode_operations = {
 const struct file_operations coda_ioctl_operations = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = coda_pioctl,
+        .llseek         = noop_llseek,
 };
 /* the coda pioctl inode ops */
@@ -57,13 +56,9 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
        struct inode *target_inode = NULL;
        struct coda_inode_info *cnp;
-        lock_kernel();
        /* get the Pioctl data arguments from user space */
-        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+        if (copy_from_user(&data, (void __user *)user_data, sizeof(data)))
-                error = -EINVAL;
+                return -EINVAL;
-                goto out;
-        }
        /*
         * Look up the pathname. Note that the pathname is in
@@ -75,13 +70,12 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
                error = user_lpath(data.path, &path);
        if (error)
-                goto out;
+                return error;
-        else
-                target_inode = path.dentry->d_inode;
+        target_inode = path.dentry->d_inode;
        /* return if it is not a Coda inode */
        if (target_inode->i_sb != inode->i_sb) {
-                path_put(&path);
                error = -EINVAL;
                goto out;
        }
@@ -90,10 +84,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
        cnp = ITOC(target_inode);
        error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
-        path_put(&path);
 out:
-        unlock_kernel();
+        path_put(&path);
        return error;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 116af7546cf0..62647a8595e4 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -35,7 +35,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/device.h>
 #include <asm/io.h>
 #include <asm/system.h>
@@ -67,8 +67,10 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
        unsigned int mask = POLLOUT | POLLWRNORM;
        poll_wait(file, &vcp->vc_waitq, wait);
+        mutex_lock(&vcp->vc_mutex);
        if (!list_empty(&vcp->vc_pending))
                mask |= POLLIN | POLLRDNORM;
+        mutex_unlock(&vcp->vc_mutex);
        return mask;
 }
@@ -108,16 +110,9 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                return -EFAULT;
        if (DOWNCALL(hdr.opcode)) {
-                struct super_block *sb = NULL;
+                union outputArgs *dcbuf;
-                union outputArgs *dcbuf;
                int size = sizeof(*dcbuf);
-                sb = vcp->vc_sb;
-                if ( !sb ) {
-                        count = nbytes;
-                        goto out;
-                }
                if  ( nbytes < sizeof(struct coda_out_hdr) ) {
                        printk("coda_downcall opc %d uniq %d, not enough!\n",
                               hdr.opcode, hdr.unique);
@@ -137,9 +132,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                }
                /* what downcall errors does Venus handle ? */
-                lock_kernel();
+                error = coda_downcall(vcp, hdr.opcode, dcbuf);
-                error = coda_downcall(hdr.opcode, dcbuf, sb);
-                unlock_kernel();
                CODA_FREE(dcbuf, nbytes);
                if (error) {
@@ -152,7 +145,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
        }
        
        /* Look for the message on the processing queue. */
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        list_for_each(lh, &vcp->vc_processing) {
                tmp = list_entry(lh, struct upc_req , uc_chain);
                if (tmp->uc_unique == hdr.unique) {
@@ -161,7 +154,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
                        break;
                }
        }
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        if (!req) {
                printk("psdev_write: msg (%d, %d) not found\n", 
@@ -216,7 +209,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
        if (nbytes == 0)
                return 0;
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        add_wait_queue(&vcp->vc_waitq, &wait);
        set_current_state(TASK_INTERRUPTIBLE);
@@ -230,7 +223,9 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
                        retval = -ERESTARTSYS;
                        break;
                }
+                mutex_unlock(&vcp->vc_mutex);
                schedule();
+                mutex_lock(&vcp->vc_mutex);
        }
        set_current_state(TASK_RUNNING);
@@ -263,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
        CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
        kfree(req);
 out:
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return (count ? count : retval);
 }
@@ -276,10 +271,10 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
        if (idx < 0 || idx >= MAX_CODADEVS)
                return -ENODEV;
-        lock_kernel();
        err = -EBUSY;
        vcp = &coda_comms[idx];
+        mutex_lock(&vcp->vc_mutex);
        if (!vcp->vc_inuse) {
                vcp->vc_inuse++;
@@ -293,7 +288,7 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
                err = 0;
        }
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return err;
 }
@@ -308,7 +303,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
                return -1;
        }
-        lock_kernel();
+        mutex_lock(&vcp->vc_mutex);
        /* Wakeup clients so they can return. */
        list_for_each_entry_safe(req, tmp, &vcp->vc_pending, uc_chain) {
@@ -333,7 +328,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
        file->private_data = NULL;
        vcp->vc_inuse--;
-        unlock_kernel();
+        mutex_unlock(&vcp->vc_mutex);
        return 0;
 }
@@ -346,6 +341,7 @@ static const struct file_operations coda_psdev_fops = {
        .unlocked_ioctl = coda_psdev_ioctl,
        .open           = coda_psdev_open,
        .release        = coda_psdev_release,
+        .llseek         = noop_llseek,
 };
 static int init_coda_psdev(void)
@@ -361,9 +357,11 @@ static int init_coda_psdev(void)
                err = PTR_ERR(coda_psdev_class);
                goto out_chrdev;
        }               
-        for (i = 0; i < MAX_CODADEVS; i++)
+        for (i = 0; i < MAX_CODADEVS; i++) {
+                mutex_init(&(&coda_comms[i])->vc_mutex);
                device_create(coda_psdev_class, NULL,
                              MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
+        }
        coda_sysctl_init();
        goto out;
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 4513b7258458..af78f007a2b0 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
@@ -29,11 +28,9 @@ static int coda_symlink_filler(struct file *file, struct page *page)
        unsigned int len = PAGE_SIZE;
        char *p = kmap(page);
-        lock_kernel();
        cii = ITOC(inode);
        error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
-        unlock_kernel();
        if (error)
                goto fail;
        SetPageUptodate(page);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b8893ab6f9e6..c3563cab9758 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,6 +27,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
@@ -606,7 +607,8 @@ static void coda_unblock_signals(sigset_t *old)
                                 (r)->uc_opcode != CODA_RELEASE) || \
                                (r)->uc_flags & CODA_REQ_READ))
-static inline void coda_waitfor_upcall(struct upc_req *req)
+static inline void coda_waitfor_upcall(struct venus_comm *vcp,
+                                       struct upc_req *req)
 {
        DECLARE_WAITQUEUE(wait, current);
        unsigned long timeout = jiffies + coda_timeout * HZ;
@@ -639,10 +641,12 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
                        break;
                }
+                mutex_unlock(&vcp->vc_mutex);
                if (blocked)
                        schedule_timeout(HZ);
                else
                        schedule();
+                mutex_lock(&vcp->vc_mutex);
        }
        if (blocked)
                coda_unblock_signals(&old);
@@ -667,18 +671,23 @@ static int coda_upcall(struct venus_comm *vcp,
 {
        union outputArgs *out;
        union inputArgs *sig_inputArgs;
-        struct upc_req *req, *sig_req;
+        struct upc_req *req = NULL, *sig_req;
-        int error = 0;
+        int error;
+        mutex_lock(&vcp->vc_mutex);
        if (!vcp->vc_inuse) {
                printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
-                return -ENXIO;
+                error = -ENXIO;
+                goto exit;
        }
        /* Format the request message. */
        req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
-        if (!req)
+        if (!req) {
-                return -ENOMEM;
+                error = -ENOMEM;
+                goto exit;
+        }
        req->uc_data = (void *)buffer;
        req->uc_flags = 0;
@@ -705,7 +714,7 @@ static int coda_upcall(struct venus_comm *vcp,
         * ENODEV.  */
        /* Go to sleep.  Wake up on signals only after the timeout. */
-        coda_waitfor_upcall(req);
+        coda_waitfor_upcall(vcp, req);
        /* Op went through, interrupt or not... */
        if (req->uc_flags & CODA_REQ_WRITE) {
@@ -759,6 +768,7 @@ static int coda_upcall(struct venus_comm *vcp,
 exit:
        kfree(req);
+        mutex_unlock(&vcp->vc_mutex);
        return error;
 }
@@ -796,21 +806,24 @@ exit:
 *
 * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */
-int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
+int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
 {
        struct inode *inode = NULL;
-        struct CodaFid *fid, *newfid;
+        struct CodaFid *fid = NULL, *newfid;
+        struct super_block *sb;
        /* Handle invalidation requests. */
-        if ( !sb || !sb->s_root)
+        mutex_lock(&vcp->vc_mutex);
-                return 0;
+        sb = vcp->vc_sb;
+        if (!sb || !sb->s_root)
+                goto unlock_out;
        switch (opcode) {
        case CODA_FLUSH:
                coda_cache_clear_all(sb);
                shrink_dcache_sb(sb);
                if (sb->s_root->d_inode)
-                    coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
+                        coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
                break;
        case CODA_PURGEUSER:
@@ -819,45 +832,53 @@ int coda_downcall(int opcode, union outputArgs * out, struct super_block *sb)
        case CODA_ZAPDIR:
                fid = &out->coda_zapdir.CodaFid;
-                inode = coda_fid_to_inode(fid, sb);
-                if (inode) {
-                        coda_flag_inode_children(inode, C_PURGE);
-                        coda_flag_inode(inode, C_VATTR);
-                }
                break;
        case CODA_ZAPFILE:
                fid = &out->coda_zapfile.CodaFid;
-                inode = coda_fid_to_inode(fid, sb);
-                if (inode)
-                        coda_flag_inode(inode, C_VATTR);
                break;
        case CODA_PURGEFID:
                fid = &out->coda_purgefid.CodaFid;
+                break;
+        case CODA_REPLACE:
+                fid = &out->coda_replace.OldFid;
+                break;
+        }
+        if (fid)
                inode = coda_fid_to_inode(fid, sb);
-                if (inode) {
-                        coda_flag_inode_children(inode, C_PURGE);
-                        /* catch the dentries later if some are still busy */
+unlock_out:
-                        coda_flag_inode(inode, C_PURGE);
+        mutex_unlock(&vcp->vc_mutex);
-                        d_prune_aliases(inode);
-                }
+        if (!inode)
+                return 0;
+        switch (opcode) {
+        case CODA_ZAPDIR:
+                coda_flag_inode_children(inode, C_PURGE);
+                coda_flag_inode(inode, C_VATTR);
+                break;
+        case CODA_ZAPFILE:
+                coda_flag_inode(inode, C_VATTR);
+                break;
+        case CODA_PURGEFID:
+                coda_flag_inode_children(inode, C_PURGE);
+                /* catch the dentries later if some are still busy */
+                coda_flag_inode(inode, C_PURGE);
+                d_prune_aliases(inode);
                break;
        case CODA_REPLACE:
-                fid = &out->coda_replace.OldFid;
                newfid = &out->coda_replace.NewFid;
-                inode = coda_fid_to_inode(fid, sb);
+                coda_replace_fid(inode, fid, newfid);
-                if (inode)
-                        coda_replace_fid(inode, fid, newfid);
                break;
        }
+        iput(inode);
-        if (inode)
-                iput(inode);
        return 0;
 }
diff --git a/fs/compat.c b/fs/compat.c
index 0644a154672b..c580c322fa6b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -29,8 +29,6 @@
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/smb.h>
-#include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
@@ -51,6 +49,7 @@
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -608,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type,
        /*
         * Single unix specification:
         * We should -EINVAL if an element length is not >= 0 and fitting an
-         * ssize_t.  The total length is fitting an ssize_t
+         * ssize_t.
         *
-         * Be careful here because iov_len is a size_t not an ssize_t
+         * In Linux, the total length is limited to MAX_RW_COUNT, there is
+         * no overflow possibility.
         */
        tot_len = 0;
        ret = -EINVAL;
        for (seg = 0; seg < nr_segs; seg++) {
-                compat_ssize_t tmp = tot_len;
                compat_uptr_t buf;
                compat_ssize_t len;
@@ -626,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type,
                }
                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
                        goto out;
-                tot_len += len;
-                if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-                        goto out;
                if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
                        ret = -EFAULT;
                        goto out;
                }
+                if (len > MAX_RW_COUNT - tot_len)
+                        len = MAX_RW_COUNT - tot_len;
+                tot_len += len;
                iov->iov_base = compat_ptr(buf);
                iov->iov_len = (compat_size_t) len;
                uvector++;
@@ -745,30 +744,6 @@ static void *do_ncp_super_data_conv(void *raw_data)
        return raw_data;
 }
-struct compat_smb_mount_data {
-        compat_int_t version;
-        __compat_uid_t mounted_uid;
-        __compat_uid_t uid;
-        __compat_gid_t gid;
-        compat_mode_t file_mode;
-        compat_mode_t dir_mode;
-};
-static void *do_smb_super_data_conv(void *raw_data)
-{
-        struct smb_mount_data *s = raw_data;
-        struct compat_smb_mount_data *c_s = raw_data;
-        if (c_s->version != SMB_MOUNT_OLDVERSION)
-                goto out;
-        s->dir_mode = c_s->dir_mode;
-        s->file_mode = c_s->file_mode;
-        s->gid = c_s->gid;
-        s->uid = c_s->uid;
-        s->mounted_uid = c_s->mounted_uid;
- out:
-        return raw_data;
-}
 struct compat_nfs_string {
        compat_uint_t len;
@@ -835,7 +810,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
        return 0;
 }
-#define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME       "nfs4"
@@ -870,9 +844,7 @@ asmlinkage long compat_sys_mount(const char __user * dev_name,
        retval = -EINVAL;
        if (kernel_type && data_page) {
-                if (!strcmp(kernel_type, SMBFS_NAME)) {
+                if (!strcmp(kernel_type, NCPFS_NAME)) {
-                        do_smb_super_data_conv((void *)data_page);
-                } else if (!strcmp(kernel_type, NCPFS_NAME)) {
                        do_ncp_super_data_conv((void *)data_page);
                } else if (!strcmp(kernel_type, NFS4_NAME)) {
                        if (do_nfs4_super_data_conv((void *) data_page))
@@ -1963,7 +1935,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
-#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED)
 /* Stuff for NFS server syscalls... */
 struct compat_nfsctl_svc {
        u16                     svc32_port;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..410ed188faa1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -46,7 +46,6 @@
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
-#include <linux/smb_fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -558,25 +557,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 #endif /* CONFIG_BLOCK */
-static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
-                        compat_uid_t __user *argp)
-{
-        mm_segment_t old_fs = get_fs();
-        __kernel_uid_t kuid;
-        int err;
-        cmd = SMB_IOC_GETMOUNTUID;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-        set_fs(old_fs);
-        if (err >= 0)
-                err = put_user(kuid, argp);
-        return err;
-}
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO         _IOW('U', 200, int)
 #define HCIUARTGETPROTO         _IOR('U', 201, int)
@@ -599,69 +579,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST _IOR('H', 210, int)
 #define HIDPGETCONNINFO _IOR('H', 211, int)
-#ifdef CONFIG_BLOCK
-struct raw32_config_request
-{
-        compat_int_t    raw_minor;
-        __u64   block_major;
-        __u64   block_minor;
-} __attribute__((packed));
-static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-        int ret;
-        if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-        ret = __get_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __get_user(req->block_major, &user_req->block_major);
-        ret |= __get_user(req->block_minor, &user_req->block_minor);
-        return ret ? -EFAULT : 0;
-}
-static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
-        int ret;
-        if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
-                return -EFAULT;
-        ret = __put_user(req->raw_minor, &user_req->raw_minor);
-        ret |= __put_user(req->block_major, &user_req->block_major);
-        ret |= __put_user(req->block_minor, &user_req->block_minor);
-        return ret ? -EFAULT : 0;
-}
-static int raw_ioctl(unsigned fd, unsigned cmd,
-                struct raw32_config_request __user *user_req)
-{
-        int ret;
-        switch (cmd) {
-        case RAW_SETBIND:
-        default: {      /* RAW_GETBIND */
-                struct raw_config_request req;
-                mm_segment_t oldfs = get_fs();
-                if ((ret = get_raw32_request(&req, user_req)))
-                        return ret;
-                set_fs(KERNEL_DS);
-                ret = sys_ioctl(fd,cmd,(unsigned long)&req);
-                set_fs(oldfs);
-                if ((!ret) && (cmd == RAW_GETBIND)) {
-                        ret = set_raw32_request(&req, user_req);
-                }
-                break;
-        }
-        }
-        return ret;
-}
-#endif /* CONFIG_BLOCK */
 struct serial_struct32 {
        compat_int_t    type;
@@ -1265,8 +1182,6 @@ COMPATIBLE_IOCTL(OSS_GETVERSION)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
-/* SMB ioctls which do not need any translations */
-COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -1523,15 +1438,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
        case MTIOCGET32:
        case MTIOCPOS32:
                return mt_ioctl_trans(fd, cmd, argp);
-        /* Raw devices */
-        case RAW_SETBIND:
-        case RAW_GETBIND:
-                return raw_ioctl(fd, cmd, argp);
 #endif
-        /* One SMB ioctl needs translations. */
-#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
-        case SMB_IOC_GETMOUNTUID_32:
-                return do_smb_getmountuid(fd, cmd, argp);
        /* Serial */
        case TIOCGSERIAL:
        case TIOCSSERIAL:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44a8d6a..253476d78ed8 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
        struct inode * inode = new_inode(configfs_sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mapping->a_ops = &configfs_aops;
                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8c8d64230c2d..7d3607febe1c 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
+/*
+ * ->mount() method for configfs: delegates to mount_single() with
+ * configfs_fill_super as the superblock filler.  @dev_name is accepted
+ * but not used here.
+ */
-static int configfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
+        return mount_single(fs_type, flags, data, configfs_fill_super);
 }
+/* Filesystem type descriptor for "configfs". */
 static struct file_system_type configfs_fs_type = {
         .owner          = THIS_MODULE,
         .name           = "configfs",
-        .get_sb         = configfs_get_sb,
+        .mount          = configfs_do_mount,
         .kill_sb        = kill_litter_super,
 };
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 1e7a33028d33..32fd5fe9ca0e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = {
        .statfs         = cramfs_statfs,
 };
+/*
+ * ->mount() method for cramfs: delegates to mount_bdev(), passing
+ * @dev_name through and using cramfs_fill_super as the superblock filler.
+ */
-static int cramfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *cramfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
-                           mnt);
 }
+/* Filesystem type descriptor for "cramfs"; flagged FS_REQUIRES_DEV. */
 static struct file_system_type cramfs_fs_type = {
         .owner          = THIS_MODULE,
         .name           = "cramfs",
-        .get_sb         = cramfs_get_sb,
+        .mount          = cramfs_mount,
         .kill_sb        = kill_block_super,
         .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be48149..23702a9d4e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@ struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
-static void __d_free(struct dentry *dentry)
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+/*
+ * sysctl handler for the dentry statistics: refresh dentry_stat.nr_dentry
+ * and dentry_stat.nr_unused from the nr_dentry / nr_dentry_unused per-cpu
+ * counters, then hand the request to proc_dointvec() and return its result.
+ */
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+                   size_t *lenp, loff_t *ppos)
+{
+        dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+        dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+/*
+ * Release the memory of a dentry.  Takes the rcu_head embedded in the
+ * dentry (d_u.d_rcu) so that d_free() can either call it directly or pass
+ * it to call_rcu() as the callback.
+ */
+static void __d_free(struct rcu_head *head)
 {
+        /* recover the dentry that embeds @head */
+        struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
         WARN_ON(!list_empty(&dentry->d_alias));
+        /* free the name only when dname_external() says so */
         if (dname_external(dentry))
                 kfree(dentry->d_name.name);
         kmem_cache_free(dentry_cache, dentry); 
 }
-static void d_callback(struct rcu_head *head)
-{
-        struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
-        __d_free(dentry);
-}
 /*
- * no dcache_lock, please.  The caller must decrement dentry_stat.nr_dentry
+ * no dcache_lock, please.
- * inside dcache_lock.
 */
 static void d_free(struct dentry *dentry)
 {
+        /* the global dentry count is now dropped here instead of by the caller */
+        percpu_counter_dec(&nr_dentry);
+        /* give the filesystem its ->d_release() callback, if it has one */
         if (dentry->d_op && dentry->d_op->d_release)
                 dentry->d_op->d_release(dentry);
         /* if dentry was never inserted into hash, immediate free is OK */
         if (hlist_unhashed(&dentry->d_hash))
-                __d_free(dentry);
+                __d_free(&dentry->d_u.d_rcu);
         else
-                call_rcu(&dentry->d_u.d_rcu, d_callback);
+                /* otherwise defer the free through call_rcu() */
+                call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 /*
@@ -123,37 +133,34 @@ static void dentry_iput(struct dentry * dentry)
 }
 /*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
 */
 static void dentry_lru_add(struct dentry *dentry)
 {
+        /*
+         * Put @dentry at the head of its superblock's LRU and bump both the
+         * per-sb and the global unused counters.  Does nothing when d_lru is
+         * already linked, so callers need not check first.
+         */
-        list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+        if (list_empty(&dentry->d_lru)) {
-        dentry->d_sb->s_nr_dentry_unused++;
+                list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry_stat.nr_unused++;
+                dentry->d_sb->s_nr_dentry_unused++;
-}
+                percpu_counter_inc(&nr_dentry_unused);
+        }
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
-        list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-        dentry->d_sb->s_nr_dentry_unused++;
-        dentry_stat.nr_unused++;
 }
 static void dentry_lru_del(struct dentry *dentry)
 {
+        /*
+         * Unlink @dentry from its superblock's LRU, if it is on one, and drop
+         * the per-sb and global unused counters accordingly.
+         */
         if (!list_empty(&dentry->d_lru)) {
-                list_del(&dentry->d_lru);
+                /* _init variant: d_lru reads as empty again afterwards */
+                list_del_init(&dentry->d_lru);
                 dentry->d_sb->s_nr_dentry_unused--;
-                dentry_stat.nr_unused--;
+                percpu_counter_dec(&nr_dentry_unused);
         }
 }
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
 {
+        /*
+         * Make @dentry the tail entry of its superblock's LRU.  If it was not
+         * on the LRU it is added and the unused counters are bumped; if it
+         * already was, it is only repositioned and the counters stay as is.
+         */
-        if (likely(!list_empty(&dentry->d_lru))) {
+        if (list_empty(&dentry->d_lru)) {
-                list_del_init(&dentry->d_lru);
+                list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-                dentry->d_sb->s_nr_dentry_unused--;
+                dentry->d_sb->s_nr_dentry_unused++;
-                dentry_stat.nr_unused--;
+                percpu_counter_inc(&nr_dentry_unused);
+        } else {
+                list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
         }
 }
@@ -172,7 +179,6 @@ static struct dentry *d_kill(struct dentry *dentry)
        struct dentry *parent;
        list_del(&dentry->d_u.d_child);
-        dentry_stat.nr_dentry--;        /* For d_free, below */
        /*drops the locks, at that point nobody can reach this dentry */
        dentry_iput(dentry);
        if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@ repeat:
                if (dentry->d_op->d_delete(dentry))
                        goto unhash_it;
        }
        /* Unreachable? Get rid of it */
        if (d_unhashed(dentry))
                goto kill_it;
-        if (list_empty(&dentry->d_lru)) {
-                dentry->d_flags |= DCACHE_REFERENCED;
+        /* Otherwise leave it cached and ensure it's on the LRU */
-                dentry_lru_add(dentry);
+        dentry->d_flags |= DCACHE_REFERENCED;
-        }
+        dentry_lru_add(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        return;
@@ -318,11 +326,10 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 /* This should be called _only_ with dcache_lock held */
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
+        /* take a reference, then make sure the dentry is off the unused LRU */
         atomic_inc(&dentry->d_count);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
         return dentry;
 }
@@ -441,73 +448,27 @@ static void prune_one_dentry(struct dentry * dentry)
                if (dentry->d_op && dentry->d_op->d_delete)
                        dentry->d_op->d_delete(dentry);
-                dentry_lru_del_init(dentry);
+                dentry_lru_del(dentry);
                __d_drop(dentry);
                dentry = d_kill(dentry);
                spin_lock(&dcache_lock);
        }
 }
-/*
+static void shrink_dentry_list(struct list_head *list)
- * Shrink the dentry LRU on a given superblock.
- * @sb   : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 {
-        LIST_HEAD(referenced);
-        LIST_HEAD(tmp);
        struct dentry *dentry;
-        int cnt = 0;
-        BUG_ON(!sb);
+        while (!list_empty(list)) {
-        BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+                dentry = list_entry(list->prev, struct dentry, d_lru);
-        spin_lock(&dcache_lock);
+                dentry_lru_del(dentry);
-        if (count != NULL)
-                /* called from prune_dcache() and shrink_dcache_parent() */
-                cnt = *count;
-restart:
-        if (count == NULL)
-                list_splice_init(&sb->s_dentry_lru, &tmp);
-        else {
-                while (!list_empty(&sb->s_dentry_lru)) {
-                        dentry = list_entry(sb->s_dentry_lru.prev,
-                                        struct dentry, d_lru);
-                        BUG_ON(dentry->d_sb != sb);
-                        spin_lock(&dentry->d_lock);
-                        /*
-                         * If we are honouring the DCACHE_REFERENCED flag and
-                         * the dentry has this flag set, don't free it. Clear
-                         * the flag and put it back on the LRU.
-                         */
-                        if ((flags & DCACHE_REFERENCED)
-                                && (dentry->d_flags & DCACHE_REFERENCED)) {
-                                dentry->d_flags &= ~DCACHE_REFERENCED;
-                                list_move(&dentry->d_lru, &referenced);
-                                spin_unlock(&dentry->d_lock);
-                        } else {
-                                list_move_tail(&dentry->d_lru, &tmp);
-                                spin_unlock(&dentry->d_lock);
-                                cnt--;
-                                if (!cnt)
-                                        break;
-                        }
-                        cond_resched_lock(&dcache_lock);
-                }
-        }
-        while (!list_empty(&tmp)) {
-                dentry = list_entry(tmp.prev, struct dentry, d_lru);
-                dentry_lru_del_init(dentry);
-                spin_lock(&dentry->d_lock);
                /*
                 * We found an inuse dentry which was not removed from
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
+                spin_lock(&dentry->d_lock);
                if (atomic_read(&dentry->d_count)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
@@ -516,13 +477,60 @@ restart:
                /* dentry->d_lock was dropped in prune_one_dentry() */
                cond_resched_lock(&dcache_lock);
        }
-        if (count == NULL && !list_empty(&sb->s_dentry_lru))
+}
-                goto restart;
-        if (count != NULL)
+/**
-                *count = cnt;
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb:         superblock to shrink dentry LRU.
+ * @count:      number of entries to prune
+ * @flags:      flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED, referenced dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+        /* called from prune_dcache() and shrink_dcache_parent() */
+        struct dentry *dentry;
+        /* dentries spared in this pass because DCACHE_REFERENCED was set */
+        LIST_HEAD(referenced);
+        /* dentries picked for shrink_dentry_list() */
+        LIST_HEAD(tmp);
+        /*
+         * Remaining budget.  NOTE(review): the loop stops only when --cnt
+         * reaches exactly zero, so a *count of 0 or less on entry walks the
+         * whole LRU -- confirm callers never pass that.
+         */
+        int cnt = *count;
+        spin_lock(&dcache_lock);
+        while (!list_empty(&sb->s_dentry_lru)) {
+                /* always examine the tail entry of the per-sb LRU */
+                dentry = list_entry(sb->s_dentry_lru.prev,
+                                struct dentry, d_lru);
+                BUG_ON(dentry->d_sb != sb);
+                /*
+                 * If we are honouring the DCACHE_REFERENCED flag and the
+                 * dentry has this flag set, don't free it.  Clear the flag
+                 * and put it back on the LRU.
+                 */
+                if (flags & DCACHE_REFERENCED) {
+                        spin_lock(&dentry->d_lock);
+                        if (dentry->d_flags & DCACHE_REFERENCED) {
+                                dentry->d_flags &= ~DCACHE_REFERENCED;
+                                list_move(&dentry->d_lru, &referenced);
+                                spin_unlock(&dentry->d_lock);
+                                cond_resched_lock(&dcache_lock);
+                                /* spared entries do not consume the budget */
+                                continue;
+                        }
+                        spin_unlock(&dentry->d_lock);
+                }
+                list_move_tail(&dentry->d_lru, &tmp);
+                if (!--cnt)
+                        break;
+                cond_resched_lock(&dcache_lock);
+        }
+        /* hand the unused part of the budget back to the caller */
+        *count = cnt;
+        shrink_dentry_list(&tmp);
+        /* spared dentries go back onto the superblock's LRU */
         if (!list_empty(&referenced))
                 list_splice(&referenced, &sb->s_dentry_lru);
         spin_unlock(&dcache_lock);
 }
 /**
@@ -538,7 +546,7 @@ static void prune_dcache(int count)
 {
        struct super_block *sb, *p = NULL;
        int w_count;
-        int unused = dentry_stat.nr_unused;
+        int unused = percpu_counter_sum_positive(&nr_dentry_unused);
        int prune_ratio;
        int pruned;
@@ -608,13 +616,19 @@ static void prune_dcache(int count)
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
- * Shrink the dcache for the specified super block. This
+ * Shrink the dcache for the specified super block. This is used to free
- * is used to free the dcache before unmounting a file
+ * the dcache before unmounting a file system.
- * system
 */
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
 {
-        __shrink_dcache_sb(sb, NULL, 0);
+        /* private list that receives the whole per-sb LRU on each pass */
+        LIST_HEAD(tmp);
+        spin_lock(&dcache_lock);
+        /*
+         * Repeat until the LRU stays empty.  Presumably needed because
+         * shrink_dentry_list() can drop dcache_lock while it works, letting
+         * new entries show up on the LRU in the meantime.
+         */
+        while (!list_empty(&sb->s_dentry_lru)) {
+                list_splice_init(&sb->s_dentry_lru, &tmp);
+                shrink_dentry_list(&tmp);
+        }
+        spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -632,7 +646,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
        /* detach this root from the system */
        spin_lock(&dcache_lock);
-        dentry_lru_del_init(dentry);
+        dentry_lru_del(dentry);
        __d_drop(dentry);
        spin_unlock(&dcache_lock);
@@ -646,7 +660,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                        spin_lock(&dcache_lock);
                        list_for_each_entry(loop, &dentry->d_subdirs,
                                            d_u.d_child) {
-                                dentry_lru_del_init(loop);
+                                dentry_lru_del(loop);
                                __d_drop(loop);
                                cond_resched_lock(&dcache_lock);
                        }
@@ -703,20 +717,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                         * otherwise we ascend to the parent and move to the
                         * next sibling if there is one */
                        if (!parent)
-                                goto out;
+                                return;
                        dentry = parent;
                } while (list_empty(&dentry->d_subdirs));
                dentry = list_entry(dentry->d_subdirs.next,
                                    struct dentry, d_u.d_child);
        }
-out:
-        /* several dentries were freed, need to correct nr_dentry */
-        spin_lock(&dcache_lock);
-        dentry_stat.nr_dentry -= detached;
-        spin_unlock(&dcache_lock);
 }
 /*
@@ -830,14 +837,15 @@ resume:
                struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
                next = tmp->next;
-                dentry_lru_del_init(dentry);
                /* 
                 * move only zero ref count dentries to the end 
                 * of the unused list for prune_dcache
                 */
                if (!atomic_read(&dentry->d_count)) {
-                        dentry_lru_add_tail(dentry);
+                        dentry_lru_move_tail(dentry);
                        found++;
+                } else {
+                        dentry_lru_del(dentry);
                }
                /*
@@ -900,12 +908,16 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
+        /*
+         * With a non-zero @nr: refuse with -1 unless @gfp_mask has __GFP_FS,
+         * otherwise prune @nr dentries.  In every other case return the
+         * number of unused dentries scaled by sysctl_vfs_cache_pressure.
+         */
+        int nr_unused;
         if (nr) {
                 if (!(gfp_mask & __GFP_FS))
                         return -1;
                 prune_dcache(nr);
         }
-        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+        /* divide first: fewer than 100 unused dentries report as 0 */
+        return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        spin_lock(&dcache_lock);
        if (parent)
                list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-        dentry_stat.nr_dentry++;
        spin_unlock(&dcache_lock);
+        percpu_counter_inc(&nr_dentry);
        return dentry;
 }
 EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@ out:
 * This is used by ncpfs in its readdir implementation.
 * Zero is returned in the dentry is invalid.
 */
- 
+int d_validate(struct dentry *dentry, struct dentry *parent)
-int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-        struct hlist_head *base;
+        struct hlist_head *head;
-        struct hlist_node *lhp;
+        struct hlist_node *node;
+        struct dentry *d;
        /* Check whether the ptr might be valid at all.. */
        if (!kmem_ptr_validate(dentry_cache, dentry))
-                goto out;
+                return 0;
+        if (dentry->d_parent != parent)
-        if (dentry->d_parent != dparent)
+                return 0;
-                goto out;
-        spin_lock(&dcache_lock);
+        /*
+         * Only look at dentry->d_name once the pointer has passed the
+         * checks above; the hash selects the chain to search.
+         */
+        head = d_hash(parent, dentry->d_name.hash);
+        rcu_read_lock();
-        base = d_hash(dparent, dentry->d_name.hash);
+        hlist_for_each_entry_rcu(d, node, head, d_hash) {
-        hlist_for_each(lhp,base) { 
+                if (d == dentry) {
-                /* hlist_for_each_entry_rcu() not required for d_hash list
+                        /* Found on the chain: take a reference for the caller. */
+                        dget(dentry);
-                 * as it is parsed under dcache_lock
-                 */
-                if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
-                        __dget_locked(dentry);
-                        spin_unlock(&dcache_lock);
+                        /* Leave the RCU read-side section on this path too. */
+                        rcu_read_unlock();
                        return 1;
                }
        }
-        spin_unlock(&dcache_lock);
+        rcu_read_unlock();
-out:
        /* Not on the hash chain of the supplied parent. */
        return 0;
 }
 EXPORT_SYMBOL(d_validate);
@@ -1994,7 +2000,7 @@ global_root:
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@ char *__d_path(const struct path *path, struct path *root,
        int error;
        prepend(&res, &buflen, "\0", 1);
+        spin_lock(&dcache_lock);
        error = prepend_path(path, root, &res, &buflen);
+        spin_unlock(&dcache_lock);
        if (error)
                return ERR_PTR(error);
        return res;
 }
@@ -2419,6 +2427,9 @@ static void __init dcache_init(void)
 {
        int loop;
+        percpu_counter_init(&nr_dentry, 0);
+        percpu_counter_init(&nr_dentry_unused, 0);
        /* 
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 0210898458b2..89d394d8fe24 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -43,6 +43,7 @@ const struct file_operations debugfs_file_operations = {
        .read =         default_read_file,
        .write =        default_write_file,
        .open =         default_open,
+        .llseek =       noop_llseek,
 };
 static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -454,6 +455,7 @@ static const struct file_operations fops_bool = {
        .read =         read_file_bool,
        .write =        write_file_bool,
        .open =         default_open,
+        .llseek =       default_llseek,
 };
 /**
@@ -498,6 +500,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 static const struct file_operations fops_blob = {
        .read =         read_file_blob,
        .open =         default_open,
+        .llseek =       default_llseek,
 };
 /**
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3dbcac..37a8ca7c1222 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
        struct inode *inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
@@ -134,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
        return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
-static int debug_get_sb(struct file_system_type *fs_type,
+static struct dentry *debug_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
+        return mount_single(fs_type, flags, data, debug_fill_super);
 }
 static struct file_system_type debug_fs_type = {
        .owner =        THIS_MODULE,
        .name =         "debugfs",
-        .get_sb =       debug_get_sb,
+        .mount =        debug_mount,
        .kill_sb =      kill_litter_super,
 };
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8b3ffd5b5235..1bb547c9cad6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 /*
- * devpts_get_sb()
+ * devpts_mount()
 *
 *     If the '-o newinstance' mount option was specified, mount a new
 *     (private) instance of devpts.  PTYs created in this instance are
@@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 *     semantics in devpts while preserving backward compatibility of the
 *     current 'single-namespace' semantics. i.e all mounts of devpts
 *     without the 'newinstance' mount option should bind to the initial
- *     kernel mount, like get_sb_single().
+ *     kernel mount, like mount_single().
 *
 *     Mounts with 'newinstance' option create a new, private namespace.
 *
 *     NOTE:
 *
- *     For single-mount semantics, devpts cannot use get_sb_single(),
+ *     For single-mount semantics, devpts cannot use mount_single(),
- *     because get_sb_single()/sget() find and use the super-block from
+ *     because mount_single()/sget() find and use the super-block from
 *     the most recent mount of devpts. But that recent mount may be a
- *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     'newinstance' mount and mount_single() would pick the newinstance
 *     super-block instead of the initial super-block.
 */
-static int devpts_get_sb(struct file_system_type *fs_type,
+static struct dentry *devpts_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        int error;
        struct pts_mount_opts opts;
@@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
        error = parse_mount_options(data, PARSE_MOUNT, &opts);
        if (error)
-                return error;
+                return ERR_PTR(error);
        if (opts.newinstance)
                s = sget(fs_type, NULL, set_anon_super, NULL);
@@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type,
                s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        if (!s->s_root) {
                s->s_flags = flags;
@@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type,
        if (error)
                goto out_undo_sget;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 out_undo_sget:
        deactivate_locked_super(s);
-        return error;
+        return ERR_PTR(error);
 }
 #else
@@ -404,10 +402,10 @@ out_undo_sget:
 * This supports only the legacy single-instance semantics (no
 * multiple-instance semantics)
 */
-static int devpts_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
-                const char *dev_name, void *data, struct vfsmount *mnt)
+                const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+        return mount_single(fs_type, flags, data, devpts_fill_super);
 }
 #endif
@@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb)
 static struct file_system_type devpts_fs_type = {
        .name           = "devpts",
-        .get_sb         = devpts_get_sb,
+        .mount          = devpts_mount,
        .kill_sb        = devpts_kill_sb,
 };
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 48d74c7391d1..85882f6ba5f7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
-static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
 {
        ssize_t transferred = 0;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index c6cf25158746..6b42ba807dfd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -643,7 +643,8 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
 static const struct file_operations waiters_fops = {
        .owner   = THIS_MODULE,
        .open    = waiters_open,
-        .read    = waiters_read
+        .read    = waiters_read,
+        .llseek  = default_llseek,
 };
 void dlm_delete_debug_file(struct dlm_ls *ls)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031dbe3a15ca..64e5f3efdd81 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1846,6 +1846,9 @@ static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
        struct dlm_lkb *gr;
        list_for_each_entry(gr, head, lkb_statequeue) {
+                /* skip self when sending basts to convertqueue */
+                if (gr == lkb)
+                        continue;
                if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
                        queue_bast(r, gr, lkb->lkb_rqmode);
                        gr->lkb_highbast = lkb->lkb_rqmode;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d45c02db6943..30d8b85febbf 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -412,7 +412,8 @@ static const struct file_operations dev_fops = {
        .read    = dev_read,
        .write   = dev_write,
        .poll    = dev_poll,
-        .owner   = THIS_MODULE
+        .owner   = THIS_MODULE,
+        .llseek  = noop_llseek,
 };
 static struct miscdevice plock_dev_misc = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b6272853130c..66d6c16bf440 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1009,6 +1009,7 @@ static const struct file_operations device_fops = {
        .write   = device_write,
        .poll    = device_poll,
        .owner   = THIS_MODULE,
+        .llseek  = noop_llseek,
 };
 static const struct file_operations ctl_device_fops = {
@@ -1017,6 +1018,7 @@ static const struct file_operations ctl_device_fops = {
        .read    = device_read,
        .write   = device_write,
        .owner   = THIS_MODULE,
+        .llseek  = noop_llseek,
 };
 static struct miscdevice ctl_device = {
@@ -1029,6 +1031,7 @@ static const struct file_operations monitor_device_fops = {
        .open    = monitor_device_open,
        .release = monitor_device_close,
        .owner   = THIS_MODULE,
+        .llseek  = noop_llseek,
 };
 static struct miscdevice monitor_device = {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 0032a9f5a3a9..413a3c48f0bb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
+#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
        u32 flags;
        struct list_head global_auth_tok_list;
        struct mutex global_auth_tok_list_mutex;
@@ -477,7 +478,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
-        return (struct ecryptfs_file_info *)file->private_data;
+        return file->private_data;
 }
 static inline void
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 622c95140802..91da02987bff 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
-#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -284,11 +283,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
        int rc = 0;
        struct file *lower_file = NULL;
-        lock_kernel();
        lower_file = ecryptfs_file_to_lower(file);
        if (lower_file->f_op && lower_file->f_op->fasync)
                rc = lower_file->f_op->fasync(fd, lower_file, flag);
-        unlock_kernel();
        return rc;
 }
@@ -332,6 +329,7 @@ const struct file_operations ecryptfs_dir_fops = {
        .fsync = ecryptfs_fsync,
        .fasync = ecryptfs_fasync,
        .splice_read = generic_file_splice_read,
+        .llseek = default_llseek,
 };
 const struct file_operations ecryptfs_main_fops = {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3fbc94203380..9d1a22d62765 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
@@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
        struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
        struct dentry *dentry_save;
        struct vfsmount *vfsmount_save;
+        unsigned int flags_save;
        int rc;
        dentry_save = nd->path.dentry;
        vfsmount_save = nd->path.mnt;
+        flags_save = nd->flags;
        nd->path.dentry = lower_dentry;
        nd->path.mnt = lower_mnt;
+        nd->flags &= ~LOOKUP_OPEN;
        rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
        nd->path.dentry = dentry_save;
        nd->path.mnt = vfsmount_save;
+        nd->flags = flags_save;
        return rc;
 }
@@ -1108,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                rc = -EOPNOTSUPP;
                goto out;
        }
-        mutex_lock(&lower_dentry->d_inode->i_mutex);
-        rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
+        rc = vfs_setxattr(lower_dentry, name, value, size, flags);
-                                                   size, flags);
-        mutex_unlock(&lower_dentry->d_inode->i_mutex);
 out:
        return rc;
 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 73811cfa2ea4..b1f6858a5223 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -446,6 +446,7 @@ out:
 */
 static int
 ecryptfs_find_auth_tok_for_sig(
+        struct key **auth_tok_key,
        struct ecryptfs_auth_tok **auth_tok,
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
        char *sig)
@@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig(
        struct ecryptfs_global_auth_tok *global_auth_tok;
        int rc = 0;
+        (*auth_tok_key) = NULL;
        (*auth_tok) = NULL;
        if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
                                                  mount_crypt_stat, sig)) {
-                struct key *auth_tok_key;
-                rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+                /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
+                 * mount_crypt_stat structure, we refuse to use auth toks that
+                 * were not inserted through the ecryptfs_add_global_auth_tok
+                 * function.
+                 */
+                if (mount_crypt_stat->flags
+                                & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+                        return -EINVAL;
+                rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
                                                       sig);
        } else
                (*auth_tok) = global_auth_tok->global_auth_tok;
@@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
                             char *filename, size_t filename_size)
 {
        struct ecryptfs_write_tag_70_packet_silly_stack *s;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        s = kmalloc(sizeof(*s), GFP_KERNEL);
@@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
        }
        dest[s->i++] = s->cipher_code;
        rc = ecryptfs_find_auth_tok_for_sig(
+                &auth_tok_key,
                &s->auth_tok, mount_crypt_stat,
                mount_crypt_stat->global_default_fnek_sig);
        if (rc) {
@@ -753,6 +765,8 @@ out_free_unlock:
 out_unlock:
        mutex_unlock(s->tfm_mutex);
 out:
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        kfree(s);
        return rc;
 }
@@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
                             char *data, size_t max_packet_size)
 {
        struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        (*packet_size) = 0;
@@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
         * >= ECRYPTFS_MAX_IV_BYTES. */
        memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
        s->desc.info = s->iv;
-        rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+        rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+                                            &s->auth_tok, mount_crypt_stat,
                                            s->fnek_sig_hex);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to find auth tok for "
@@ -986,6 +1002,8 @@ out:
                (*filename_size) = 0;
                (*filename) = NULL;
        }
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        kfree(s);
        return rc;
 }
@@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
                       ECRYPTFS_VERSION_MAJOR,
                       ECRYPTFS_VERSION_MINOR);
                rc = -EINVAL;
-                goto out;
+                goto out_release_key;
        }
        if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
            && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
                printk(KERN_ERR "Invalid auth_tok structure "
                       "returned from key query\n");
                rc = -EINVAL;
-                goto out;
+                goto out_release_key;
+        }
+out_release_key:
+        if (rc) {
+                key_put(*auth_tok_key);
+                (*auth_tok_key) = NULL;
        }
 out:
        return rc;
@@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
        struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
        size_t tag_11_contents_size;
        size_t tag_11_packet_size;
+        struct key *auth_tok_key = NULL;
        int rc = 0;
        INIT_LIST_HEAD(&auth_tok_list);
@@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
         * just one will be sufficient to decrypt to get the FEK. */
 find_next_matching_auth_tok:
        found_auth_tok = 0;
+        if (auth_tok_key) {
+                key_put(auth_tok_key);
+                auth_tok_key = NULL;
+        }
        list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
                candidate_auth_tok = &auth_tok_list_item->auth_tok;
                if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1800,10 +1828,11 @@ find_next_matching_auth_tok:
                        rc = -EINVAL;
                        goto out_wipe_list;
                }
-                ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+                rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
+                                               &matching_auth_tok,
                                               crypt_stat->mount_crypt_stat,
                                               candidate_auth_tok_sig);
-                if (matching_auth_tok) {
+                if (!rc) {
                        found_auth_tok = 1;
                        goto found_matching_auth_tok;
                }
@@ -1866,6 +1895,8 @@ found_matching_auth_tok:
 out_wipe_list:
        wipe_auth_tok_list(&auth_tok_list);
 out:
+        if (auth_tok_key)
+                key_put(auth_tok_key);
        return rc;
 }
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index cbd4e18adb20..a9dbd62518e6 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
-       ecryptfs_opt_unlink_sigs, ecryptfs_opt_err };
+       ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
+       ecryptfs_opt_err };
 static const match_table_t tokens = {
        {ecryptfs_opt_sig, "sig=%s"},
@@ -223,6 +224,7 @@ static const match_table_t tokens = {
        {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
        {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
        {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
+        {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
        {ecryptfs_opt_err, NULL}
 };
@@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
                case ecryptfs_opt_unlink_sigs:
                        mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS;
                        break;
+                case ecryptfs_opt_mount_auth_tok_only:
+                        mount_crypt_stat->flags |=
+                                ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
+                        break;
                case ecryptfs_opt_err:
                default:
                        printk(KERN_WARNING
@@ -540,9 +546,8 @@ out:
 *                        ecryptfs_interpose to perform most of the linking
 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
 */
-static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
-                        const char *dev_name, void *raw_data,
+                        const char *dev_name, void *raw_data)
-                        struct vfsmount *mnt)
 {
        struct super_block *s;
        struct ecryptfs_sb_info *sbi;
@@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
                err = "Reading sb failed";
                goto out;
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 out:
        if (sbi) {
@@ -616,7 +620,7 @@ out:
                kmem_cache_free(ecryptfs_sb_info_cache, sbi);
        }
        printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
-        return rc;
+        return ERR_PTR(rc);
 }
 /**
@@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
 static struct file_system_type ecryptfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "ecryptfs",
-        .get_sb = ecryptfs_get_sb,
+        .mount = ecryptfs_mount,
        .kill_sb = ecryptfs_kill_block_super,
        .fs_flags = 0
 };
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 00208c3d7e92..940a82e63dc3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -482,6 +482,7 @@ static const struct file_operations ecryptfs_miscdev_fops = {
        .read    = ecryptfs_miscdev_read,
        .write   = ecryptfs_miscdev_write,
        .release = ecryptfs_miscdev_release,
+        .llseek  = noop_llseek,
 };
 static struct miscdevice ecryptfs_miscdev = {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f7fc286a3aa9..253732382d37 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -180,6 +180,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_printf(m, ",ecryptfs_encrypted_view");
        if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS)
                seq_printf(m, ",ecryptfs_unlink_sigs");
+        if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
+                seq_printf(m, ",ecryptfs_mount_auth_tok_only");
        return 0;
 }
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f04942810818..5073a07652cc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -20,16 +20,16 @@
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
-static int efs_get_sb(struct file_system_type *fs_type,
+static struct dentry *efs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
 }
 static struct file_system_type efs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "efs",
-        .get_sb         = efs_get_sb,
+        .mount          = efs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6bd3f76fdf88..e0194b3e14d6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -293,6 +293,7 @@ static const struct file_operations eventfd_fops = {
        .poll           = eventfd_poll,
        .read           = eventfd_read,
        .write          = eventfd_write,
+        .llseek         = noop_llseek,
 };
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3817149919cb..8cf07242067d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -77,9 +77,6 @@
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
-/* Maximum msec timeout value storeable in a long int */
-#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 #define EP_UNACTIVE_PTR ((void *) -1L)
@@ -674,7 +671,8 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
        .release        = ep_eventpoll_release,
-        .poll           = ep_eventpoll_poll
+        .poll           = ep_eventpoll_poll,
+        .llseek         = noop_llseek,
 };
 /* Fast test to see if the file is an evenpoll file */
@@ -1116,18 +1114,22 @@ static int ep_send_events(struct eventpoll *ep,
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
 {
-        int res, eavail;
+        int res, eavail, timed_out = 0;
        unsigned long flags;
-        long jtimeout;
+        long slack;
        wait_queue_t wait;
+        struct timespec end_time;
-        /*
+        ktime_t expires, *to = NULL;
-         * Calculate the timeout by checking for the "infinite" value (-1)
-         * and the overflow condition. The passed timeout is in milliseconds,
+        if (timeout > 0) {
-         * that why (t * HZ) / 1000.
+                ktime_get_ts(&end_time);
-         */
+                timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
-        jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
+                slack = select_estimate_accuracy(&end_time);
-                MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
+                to = &expires;
+                *to = timespec_to_ktime(end_time);
+        } else if (timeout == 0) {
+                timed_out = 1;
+        }
 retry:
        spin_lock_irqsave(&ep->lock, flags);
@@ -1149,7 +1151,7 @@ retry:
                         * to TASK_INTERRUPTIBLE before doing the checks.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
-                        if (!list_empty(&ep->rdllist) || !jtimeout)
+                        if (!list_empty(&ep->rdllist) || timed_out)
                                break;
                        if (signal_pending(current)) {
                                res = -EINTR;
@@ -1157,7 +1159,9 @@ retry:
                        }
                        spin_unlock_irqrestore(&ep->lock, flags);
-                        jtimeout = schedule_timeout(jtimeout);
+                        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+                                timed_out = 1;
                        spin_lock_irqsave(&ep->lock, flags);
                }
                __remove_wait_queue(&ep->wq, &wait);
@@ -1175,7 +1179,7 @@ retry:
         * more luck.
         */
        if (!res && eavail &&
-            !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
+            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                goto retry;
        return res;
diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f936858..99d33a1371e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -65,6 +66,12 @@ char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 int suid_dumpable = 0;
+struct core_name {
+        char *corename;
+        int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
 /* The maximal length of core_pattern is also specified in sysctl.c */
 static LIST_HEAD(formats);
@@ -759,6 +766,10 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
+        if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+                atomic_dec(&old_mm->oom_disable_count);
+                atomic_inc(&tsk->mm->oom_disable_count);
+        }
        task_unlock(tsk);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
@@ -998,7 +1009,7 @@ int flush_old_exec(struct linux_binprm * bprm)
        bprm->mm = NULL;                /* We're using it now */
-        current->flags &= ~PF_RANDOMIZE;
+        current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
        flush_thread();
        current->personality &= ~bprm->per_clear;
@@ -1078,14 +1089,14 @@ EXPORT_SYMBOL(setup_new_exec);
 */
 int prepare_bprm_creds(struct linux_binprm *bprm)
 {
-        if (mutex_lock_interruptible(&current->cred_guard_mutex))
+        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                return -ERESTARTNOINTR;
        bprm->cred = prepare_exec_creds();
        if (likely(bprm->cred))
                return 0;
-        mutex_unlock(&current->cred_guard_mutex);
+        mutex_unlock(&current->signal->cred_guard_mutex);
        return -ENOMEM;
 }
@@ -1093,7 +1104,7 @@ void free_bprm(struct linux_binprm *bprm)
 {
        free_arg_pages(bprm);
        if (bprm->cred) {
-                mutex_unlock(&current->cred_guard_mutex);
+                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        kfree(bprm);
@@ -1114,13 +1125,13 @@ void install_exec_creds(struct linux_binprm *bprm)
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);
-        mutex_unlock(&current->cred_guard_mutex);
+        mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 /*
 * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH
 */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1401,7 +1412,6 @@ int do_execve(const char * filename,
        if (retval < 0)
                goto out;
-        current->flags &= ~PF_KTHREAD;
        retval = search_binary_handler(bprm,regs);
        if (retval < 0)
                goto out;
@@ -1454,127 +1464,148 @@ void set_binfmt(struct linux_binfmt *new)
 EXPORT_SYMBOL(set_binfmt);
+static int expand_corename(struct core_name *cn)
+{       /*
+         * Grow cn->corename to CORENAME_MAX_SIZE times the freshly incremented
+         * global call_count. Returns 0 on success. On allocation failure the
+         * old buffer is freed, cn->corename is left NULL and -ENOMEM is
+         * returned; cn->size has already been overwritten with the new size
+         * at that point.
+         */
+        char *old_corename = cn->corename;
+        cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+        cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+        if (!cn->corename) {    /* old buffer is still ours to free after a failed krealloc() */
+                kfree(old_corename);
+                return -ENOMEM;
+        }
+        return 0;
+}
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{       /*
+         * Append printf-style formatted text to cn->corename at offset
+         * cn->used, calling expand_corename() first when the formatted length
+         * does not fit in the remaining space. Returns 0 on success or the
+         * error returned by expand_corename().
+         * NOTE(review): the buffer is expanded at most once per call, so this
+         * relies on a single expansion being enough for any one item; confirm.
+         */
+        char *cur;
+        int need;
+        int ret;
+        va_list arg;
+        va_start(arg, fmt);
+        need = vsnprintf(NULL, 0, fmt, arg);    /* measure only: length without the NUL */
+        va_end(arg);
+        if (likely(need < cn->size - cn->used - 1))
+                goto out_printf;
+        ret = expand_corename(cn);
+        if (ret)
+                goto expand_fail;
+out_printf:
+        cur = cn->corename + cn->used;
+        va_start(arg, fmt);     /* the first pass consumed the va_list; restart it */
+        vsnprintf(cur, need + 1, fmt, arg);
+        va_end(arg);
+        cn->used += need;       /* NUL is not counted, so the next append overwrites it */
+        return 0;
+expand_fail:
+        return ret;
+}
 /* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
-static int format_corename(char *corename, long signr)
+static int format_corename(struct core_name *cn, long signr)
 {
        const struct cred *cred = current_cred();
        const char *pat_ptr = core_pattern;
        int ispipe = (*pat_ptr == '|');
-        char *out_ptr = corename;
-        char *const out_end = corename + CORENAME_MAX_SIZE;
-        int rc;
        int pid_in_pattern = 0;
+        int err = 0;
+        cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+        cn->corename = kmalloc(cn->size, GFP_KERNEL);
+        cn->used = 0;
+        if (!cn->corename)
+                return -ENOMEM;
        /* Repeat as long as we have more pattern to process and more output
           space */
        while (*pat_ptr) {
                if (*pat_ptr != '%') {
-                        if (out_ptr == out_end)
+                        if (*pat_ptr == 0)
                                goto out;
-                        *out_ptr++ = *pat_ptr++;
+                        err = cn_printf(cn, "%c", *pat_ptr++);
                } else {
                        switch (*++pat_ptr) {
+                        /* single % at the end, drop that */
                        case 0:
                                goto out;
                        /* Double percent, output one percent */
                        case '%':
-                                if (out_ptr == out_end)
+                                err = cn_printf(cn, "%c", '%');
-                                        goto out;
-                                *out_ptr++ = '%';
                                break;
                        /* pid */
                        case 'p':
                                pid_in_pattern = 1;
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d",
-                                              "%d", task_tgid_vnr(current));
+                                              task_tgid_vnr(current));
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* uid */
                        case 'u':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d", cred->uid);
-                                              "%d", cred->uid);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* gid */
                        case 'g':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%d", cred->gid);
-                                              "%d", cred->gid);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* signal that caused the coredump */
                        case 's':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%ld", signr);
-                                              "%ld", signr);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* UNIX time of coredump */
                        case 't': {
                                struct timeval tv;
                                do_gettimeofday(&tv);
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%lu", tv.tv_sec);
-                                              "%lu", tv.tv_sec);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        }
                        /* hostname */
                        case 'h':
                                down_read(&uts_sem);
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%s",
-                                              "%s", utsname()->nodename);
+                                              utsname()->nodename);
                                up_read(&uts_sem);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* executable */
                        case 'e':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%s", current->comm);
-                                              "%s", current->comm);
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        /* core limit size */
                        case 'c':
-                                rc = snprintf(out_ptr, out_end - out_ptr,
+                                err = cn_printf(cn, "%lu",
-                                              "%lu", rlimit(RLIMIT_CORE));
+                                              rlimit(RLIMIT_CORE));
-                                if (rc > out_end - out_ptr)
-                                        goto out;
-                                out_ptr += rc;
                                break;
                        default:
                                break;
                        }
                        ++pat_ptr;
                }
+                if (err)
+                        return err;
        }
        /* Backward compatibility with core_uses_pid:
         *
         * If core_pattern does not include a %p (as is the default)
         * and core_uses_pid is set, then .%pid will be appended to
         * the filename. Do not do this for piped commands. */
        if (!ispipe && !pid_in_pattern && core_uses_pid) {
-                rc = snprintf(out_ptr, out_end - out_ptr,
+                err = cn_printf(cn, ".%d", task_tgid_vnr(current));
-                              ".%d", task_tgid_vnr(current));
+                if (err)
-                if (rc > out_end - out_ptr)
+                        return err;
-                        goto out;
-                out_ptr += rc;
        }
 out:
-        *out_ptr = 0;
        return ispipe;
 }
@@ -1851,7 +1882,7 @@ static int umh_pipe_setup(struct subprocess_info *info)
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
        struct core_state core_state;
-        char corename[CORENAME_MAX_SIZE + 1];
+        struct core_name cn;
        struct mm_struct *mm = current->mm;
        struct linux_binfmt * binfmt;
        const struct cred *old_cred;
@@ -1906,7 +1937,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         */
        clear_thread_flag(TIF_SIGPENDING);
-        ispipe = format_corename(corename, signr);
+        ispipe = format_corename(&cn, signr);
+        if (ispipe == -ENOMEM) {
+                printk(KERN_WARNING "format_corename failed\n");
+                printk(KERN_WARNING "Aborting core\n");
+                goto fail_corename;
+        }
        if (ispipe) {
                int dump_count;
@@ -1943,7 +1980,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                        goto fail_dropcount;
                }
-                helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
+                helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
                if (!helper_argv) {
                        printk(KERN_WARNING "%s failed to allocate memory\n",
                               __func__);
@@ -1956,7 +1993,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                argv_free(helper_argv);
                if (retval) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
-                               corename);
+                               cn.corename);
                        goto close_fail;
                }
        } else {
@@ -1965,7 +2002,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                if (cprm.limit < binfmt->min_coredump)
                        goto fail_unlock;
-                cprm.file = filp_open(corename,
+                cprm.file = filp_open(cn.corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
                                 0600);
                if (IS_ERR(cprm.file))
@@ -2007,6 +2044,8 @@ fail_dropcount:
        if (ispipe)
                atomic_dec(&core_dump_count);
 fail_unlock:
+        kfree(cn.corename);
+fail_corename:
        coredump_finish(mm);
        revert_creds(old_cred);
 fail_creds:
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d91e9d829bc1..dcc941d82d67 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -420,7 +420,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
        err = exofs_write_begin(NULL, page->mapping, pos, len,
                                AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
        if (err)
-                EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+                EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
                          err);
        de->inode_no = cpu_to_le64(inode->i_ino);
@@ -556,7 +556,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
        err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
                                                        &page, NULL);
        if (err)
-                EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+                EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
                          err);
        if (pde)
                pde->rec_len = cpu_to_le16(to - from);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e3bb98..b905c79b4f0a 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 {
        int ret;
        struct inode *inode = filp->f_mapping->host;
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct super_block *sb;
        if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return 0;
-        ret = sync_inode(inode, &wbc);
+        ret = sync_inode_metadata(inode, 1);
        /* This is a good place to write the sb */
        /* TODO: Sechedule an sb-sync on create */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3eadd97324b1..42685424817b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -185,7 +185,7 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
 * status.
 */
-static int __readpages_done(struct page_collect *pcol, bool do_unlock)
+static int __readpages_done(struct page_collect *pcol)
 {
        int i;
        u64 resid;
@@ -221,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
                          page_stat ? "bad_bytes" : "good_bytes");
                ret = update_read_page(page, page_stat);
-                if (do_unlock)
+                if (!pcol->read_4_write)
                        unlock_page(page);
                length += PAGE_SIZE;
        }
@@ -236,7 +236,7 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
-        __readpages_done(pcol, true);
+        __readpages_done(pcol);
        atomic_dec(&pcol->sbi->s_curr_pending);
        kfree(pcol);
 }
@@ -257,7 +257,7 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
        }
 }
-static int read_exec(struct page_collect *pcol, bool is_sync)
+static int read_exec(struct page_collect *pcol)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
        struct exofs_io_state *ios = pcol->ios;
@@ -267,17 +267,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        if (!pcol->pages)
                return 0;
-        /* see comment in _readpage() about sync reads */
-        WARN_ON(is_sync && (pcol->nr_pages != 1));
        ios->pages = pcol->pages;
        ios->nr_pages = pcol->nr_pages;
        ios->length = pcol->length;
        ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
-        if (is_sync) {
+        if (pcol->read_4_write) {
                exofs_oi_read(oi, pcol->ios);
-                return __readpages_done(pcol, false);
+                return __readpages_done(pcol);
        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -303,7 +300,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        return 0;
 err:
-        if (!is_sync)
+        if (!pcol->read_4_write)
                _unlock_pcol_pages(pcol, ret, READ);
        pcol_free(pcol);
@@ -356,7 +353,7 @@ static int readpage_strip(void *data, struct page *page)
                EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
                             " splitting\n", inode->i_ino, page->index);
-                return read_exec(pcol, false);
+                return read_exec(pcol);
        }
 try_again:
@@ -366,7 +363,7 @@ try_again:
        } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
                   page->index)) {
                /* Discontinuity detected, split the request */
-                ret = read_exec(pcol, false);
+                ret = read_exec(pcol);
                if (unlikely(ret))
                        goto fail;
                goto try_again;
@@ -391,7 +388,7 @@ try_again:
                          page, len, pcol->nr_pages, pcol->length);
                /* split the request, and start again with current page */
-                ret = read_exec(pcol, false);
+                ret = read_exec(pcol);
                if (unlikely(ret))
                        goto fail;
@@ -420,27 +417,24 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
                return ret;
        }
-        return read_exec(&pcol, false);
+        return read_exec(&pcol);
 }
-static int _readpage(struct page *page, bool is_sync)
+static int _readpage(struct page *page, bool read_4_write)
 {
        struct page_collect pcol;
        int ret;
        _pcol_init(&pcol, 1, page->mapping->host);
-        /* readpage_strip might call read_exec(,is_sync==false) at several
+        pcol.read_4_write = read_4_write;
-         * places but not if we have a single page.
-         */
-        pcol.read_4_write = is_sync;
        ret = readpage_strip(&pcol, page);
        if (ret) {
                EXOFS_ERR("_readpage => %d\n", ret);
                return ret;
        }
-        return read_exec(&pcol, is_sync);
+        return read_exec(&pcol);
 }
 /*
@@ -511,7 +505,7 @@ static int write_exec(struct page_collect *pcol)
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
        if (!pcol_copy) {
-                EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+                EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
                ret = -ENOMEM;
                goto err;
        }
@@ -527,7 +521,7 @@ static int write_exec(struct page_collect *pcol)
        ret = exofs_oi_write(oi, ios);
        if (unlikely(ret)) {
-                EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
+                EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
                goto err;
        }
@@ -628,7 +622,7 @@ try_again:
                /* split the request, next loop will start again */
                ret = write_exec(pcol);
                if (unlikely(ret)) {
-                        EXOFS_DBGMSG("write_exec faild => %d", ret);
+                        EXOFS_DBGMSG("write_exec failed => %d", ret);
                        goto fail;
                }
@@ -719,7 +713,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
                ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
                                         fsdata);
                if (ret) {
-                        EXOFS_DBGMSG("simple_write_begin faild\n");
+                        EXOFS_DBGMSG("simple_write_begin failed\n");
                        goto out;
                }
@@ -732,7 +726,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
                if (ret) {
                        /*SetPageError was done by _readpage. Is it ok?*/
                        unlock_page(page);
-                        EXOFS_DBGMSG("__readpage_filler faild\n");
+                        EXOFS_DBGMSG("__readpage_filler failed\n");
                }
        }
 out:
@@ -1036,6 +1030,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
                memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
        }
+        inode->i_mapping->backing_dev_info = sb->s_bdi;
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &exofs_file_inode_operations;
                inode->i_fop = &exofs_file_operations;
@@ -1072,8 +1067,10 @@ bad_inode:
 int __exofs_wait_obj_created(struct exofs_i_info *oi)
 {
        if (!obj_created(oi)) {
+                EXOFS_DBGMSG("!obj_created\n");
                BUG_ON(!obj_2bcreated(oi));
                wait_event(oi->i_wq, obj_created(oi));
+                EXOFS_DBGMSG("wait_event done\n");
        }
        return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
@@ -1095,7 +1092,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
        atomic_dec(&sbi->s_curr_pending);
        if (unlikely(ret)) {
-                EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+                EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
                          _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
                /*TODO: When FS is corrupted creation can fail, object already
                 * exist. Get rid of this asynchronous creation, if exist
@@ -1107,7 +1104,6 @@ static void create_done(struct exofs_io_state *ios, void *p)
        set_obj_created(oi);
-        atomic_dec(&inode->i_count);
        wake_up(&oi->i_wq);
 }
@@ -1135,6 +1131,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        sbi = sb->s_fs_info;
+        inode->i_mapping->backing_dev_info = sb->s_bdi;
        sb->s_dirt = 1;
        inode_init_owner(inode, dir, mode);
        inode->i_ino = sbi->s_nextid++;
@@ -1157,17 +1154,11 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        ios->obj.id = exofs_oi_objno(oi);
        exofs_make_credential(oi->i_cred, &ios->obj);
-        /* increment the refcount so that the inode will still be around when we
-         * reach the callback
-         */
-        atomic_inc(&inode->i_count);
        ios->done = create_done;
        ios->private = inode;
        ios->cred = oi->i_cred;
        ret = exofs_sbi_create(ios);
        if (ret) {
-                atomic_dec(&inode->i_count);
                exofs_put_io_state(ios);
                return ERR_PTR(ret);
        }
@@ -1215,7 +1206,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        args = kzalloc(sizeof(*args), GFP_KERNEL);
        if (!args) {
-                EXOFS_DBGMSG("Faild kzalloc of args\n");
+                EXOFS_DBGMSG("Failed kzalloc of args\n");
                return -ENOMEM;
        }
@@ -1257,12 +1248,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        ios->out_attr_len = 1;
        ios->out_attr = &attr;
-        if (!obj_created(oi)) {
+        wait_obj_created(oi);
-                EXOFS_DBGMSG("!obj_created\n");
-                BUG_ON(!obj_2bcreated(oi));
-                wait_event(oi->i_wq, obj_created(oi));
-                EXOFS_DBGMSG("wait_event done\n");
-        }
        if (!do_sync) {
                args->sbi = sbi;
@@ -1325,12 +1311,12 @@ void exofs_evict_inode(struct inode *inode)
        inode->i_size = 0;
        end_writeback(inode);
-        /* if we are deleting an obj that hasn't been created yet, wait */
+        /* if we are deleting an obj that hasn't been created yet, wait.
-        if (!obj_created(oi)) {
+         * This also makes sure that create_done cannot be called with an
-                BUG_ON(!obj_2bcreated(oi));
+         * already evicted inode.
-                wait_event(oi->i_wq, obj_created(oi));
+         */
-                /* ignore the error attempt a remove anyway */
+        wait_obj_created(oi);
-        }
+        /* ignore the error, attempt a remove anyway */
        /* Now Remove the OSD objects */
        ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6550bf70e41d..f74a2ec027a6 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -55,7 +55,7 @@ int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
        ret = osd_finalize_request(or, 0, cred, NULL);
        if (unlikely(ret)) {
-                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+                EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
                goto out;
        }
@@ -79,7 +79,7 @@ int exofs_get_io_state(struct exofs_layout *layout,
         */
        ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
        if (unlikely(!ios)) {
-                EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+                EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
                             exofs_io_state_size(layout->s_numdevs));
                *pios = NULL;
                return -ENOMEM;
@@ -172,7 +172,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
                ret = osd_finalize_request(or, 0, ios->cred, NULL);
                if (unlikely(ret)) {
-                        EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+                        EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
                                     ret);
                        return ret;
                }
@@ -361,7 +361,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
                per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
                if (unlikely(!per_dev->bio)) {
-                        EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+                        EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
                                     bio_size);
                        return -ENOMEM;
                }
@@ -564,7 +564,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
                                                  master_dev->bio->bi_max_vecs);
                                if (unlikely(!bio)) {
                                        EXOFS_DBGMSG(
-                                              "Faild to allocate BIO size=%u\n",
+                                              "Failed to allocate BIO size=%u\n",
                                              master_dev->bio->bi_max_vecs);
                                        ret = -ENOMEM;
                                        goto out;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c236863..264e95d02830 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return exofs_add_nondir(dentry, inode);
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 047e92fa3af8..79c3ae6e0456 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -659,19 +659,19 @@ free_bdi:
 /*
 * Set up the superblock (calls exofs_fill_super eventually)
 */
-static int exofs_get_sb(struct file_system_type *type,
+static struct dentry *exofs_mount(struct file_system_type *type,
                          int flags, const char *dev_name,
-                          void *data, struct vfsmount *mnt)
+                          void *data)
 {
        struct exofs_mountopt opts;
        int ret;
        ret = parse_options(data, &opts);
        if (ret)
-                return ret;
+                return ERR_PTR(ret);
        opts.dev_name = dev_name;
-        return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+        return mount_nodev(type, flags, &opts, exofs_fill_super);
 }
 /*
@@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = {
 static struct file_system_type exofs_type = {
        .owner          = THIS_MODULE,
        .name           = "exofs",
-        .get_sb         = exofs_get_sb,
+        .mount          = exofs_mount,
        .kill_sb        = generic_shutdown_super,
 };
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e175949a63..51b304056f10 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@ static struct dentry *
 find_disconnected_root(struct dentry *dentry)
 {
        dget(dentry);
-        spin_lock(&dentry->d_lock);
+        while (!IS_ROOT(dentry)) {
-        while (!IS_ROOT(dentry) &&
+                struct dentry *parent = dget_parent(dentry);
-               (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
-                struct dentry *parent = dentry->d_parent;
+                if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
-                dget(parent);
+                        dput(parent);
-                spin_unlock(&dentry->d_lock);
+                        break;
+                }
                dput(dentry);
                dentry = parent;
-                spin_lock(&dentry->d_lock);
        }
-        spin_unlock(&dentry->d_lock);
        return dentry;
 }
 /*
 * Make sure target_dir is fully connected to the dentry tree.
 *
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index c6c684b44ea1..0d06f4e75699 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -646,10 +646,9 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
        return here;
 }
-/*
+/**
 * ext2_try_to_allocate()
 * @sb:                 superblock
- * @handle:             handle to this transaction
 * @group:              given allocation block group
 * @bitmap_bh:          bufferhead holds the block bitmap
 * @grp_goal:           given target block within the group
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 764109886ec0..2709b34206ab 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
        if (IS_DIRSYNC(dir)) {
                err = write_one_page(page, 1);
                if (!err)
-                        err = ext2_sync_inode(dir);
+                        err = sync_inode_metadata(dir, 1);
        } else {
                unlock_page(page);
        }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa62242c..6346a2acf326 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
-extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 940c96168868..40ad210a5049 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -458,7 +458,7 @@ failed_out:
 *      the same format as ext2_get_branch() would do. We are calling it after
 *      we had read the existing part of chain and partial points to the last
 *      triple of that (one with zero ->key). Upon the exit we have the same
- *      picture as after the successful ext2_get_block(), excpet that in one
+ *      picture as after the successful ext2_get_block(), except that in one
 *      place chain is disconnected - *branch->p is still zero (we did not
 *      set the last link), but branch->key contains the number that should
 *      be placed into *branch->p to fill that gap.
@@ -662,7 +662,7 @@ static int ext2_get_blocks(struct inode *inode,
        mutex_lock(&ei->truncate_mutex);
        /*
         * If the indirect block is missing while we are reading
-         * the chain(ext3_get_branch() returns -EAGAIN err), or
+         * the chain(ext2_get_branch() returns -EAGAIN err), or
         * if the chain has been changed after we grab the semaphore,
         * (either because another process truncated this branch, or
         * another get_block allocated this branch) re-grab the chain to see if
@@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        if (inode_needs_sync(inode)) {
                sync_mapping_buffers(inode->i_mapping);
-                ext2_sync_inode (inode);
+                sync_inode_metadata(inode, 1);
        } else {
                mark_inode_dirty(inode);
        }
@@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
        return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
-int ext2_sync_inode(struct inode *inode)
-{
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0,       /* sys_fsync did this */
-        };
-        return sync_inode(inode, &wbc);
-}
 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
        struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e9a3f2..f8aecd2e3297 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext2_add_link(dentry, inode);
        if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1ec602673ea8..d89e0b6a2d78 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -747,15 +747,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        __le32 features;
        int err;
+        err = -ENOMEM;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
-                return -ENOMEM;
+                goto failed_unlock;
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
        if (!sbi->s_blockgroup_lock) {
                kfree(sbi);
-                return -ENOMEM;
+                goto failed_unlock;
        }
        sb->s_fs_info = sbi;
        sbi->s_sb_block = sb_block;
@@ -1107,6 +1108,7 @@ failed_sbi:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
+failed_unlock:
        return ret;
 }
@@ -1219,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        }
        es = sbi->s_es;
-        if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
+        if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
-            (old_mount_opt & EXT2_MOUNT_XIP)) &&
-            invalidate_inodes(sb)) {
                ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
                         "xip flag with busy inodes while remounting");
                sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
@@ -1356,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
        return 0;
 }
-static int ext2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ext2_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
 }
 #ifdef CONFIG_QUOTA
@@ -1473,7 +1473,7 @@ out:
 static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext2",
-        .get_sb         = ext2_get_sb,
+        .mount          = ext2_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae15129e..f84700be3274 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
        EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
        inode->i_ctime = CURRENT_TIME_SEC;
        if (IS_SYNC(inode)) {
-                error = ext2_sync_inode (inode);
+                error = sync_inode_metadata(inode, 1);
                /* In case sync failed due to ENOSPC the inode was actually
                 * written (only some dirty data were not) so we just proceed
                 * as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511f4ded..b3db22649426 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -792,9 +792,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
        if (here < 0)
                here = 0;
-        p = ((char *)bh->b_data) + (here >> 3);
+        p = bh->b_data + (here >> 3);
        r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-        next = (r - ((char *)bh->b_data)) << 3;
+        next = (r - bh->b_data) << 3;
        if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
                return next;
@@ -810,8 +810,9 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 /**
 * claim_block()
+ * @lock:               the spin lock for this block group
 * @block:              the free block (group relative) to allocate
- * @bh:                 the bufferhead containts the block group bitmap
+ * @bh:                 the buffer_head that contains the block group bitmap
 *
 * We think we can allocate this block in this bitmap.  Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
@@ -956,9 +957,11 @@ fail_access:
 *              but we will shift to the place where start_block is,
 *              then start from there, when looking for a reservable space.
 *
- *      @size: the target new reservation window size
+ *      @my_rsv: the reservation window
 *
- *      @group_first_block: the first block we consider to start
+ *      @sb: the super block
+ *
+ *      @start_block: the first block we consider to start
 *                      the real search from
 *
 *      @last_block:
@@ -1084,7 +1087,7 @@ static int find_next_reservable_window(
 *
 *      failed: we failed to find a reservation window in this group
 *
- *      @rsv: the reservation
+ *      @my_rsv: the reservation window
 *
 *      @grp_goal: The goal (group-relative).  It is where the search for a
 *              free reservable space should start from.
@@ -1273,8 +1276,8 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 * @group:              given allocation block group
 * @bitmap_bh:          bufferhead holds the block bitmap
 * @grp_goal:           given target block within the group
- * @count:              target number of blocks to allocate
 * @my_rsv:             reservation window
+ * @count:              target number of blocks to allocate
 * @errp:               pointer to store the error code
 *
 * This is the main function used to allocate a new block and its reservation
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
         * storage
         */
        if (needs_barrier)
-                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-                                BLKDEV_IFL_WAIT);
        return ret;
 }
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4ab72db3559e..9724aef22460 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -570,9 +570,14 @@ got:
        ei->i_state_flags = 0;
        ext3_set_inode_state(inode, EXT3_STATE_NEW);
-        ei->i_extra_isize =
+        /* See comment in ext3_iget for explanation */
-                (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
+        if (ino >= EXT3_FIRST_INO(sb) + 1 &&
-                sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+            EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
+                ei->i_extra_isize =
+                        sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
+        } else {
+                ei->i_extra_isize = 0;
+        }
        ret = inode;
        dquot_initialize(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4cda79..a9580617edd2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -498,7 +498,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 }
 /**
- *      ext3_blks_to_allocate: Look up the block map and count the number
+ *      ext3_blks_to_allocate - Look up the block map and count the number
 *      of direct blocks need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
@@ -536,14 +536,18 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 }
 /**
- *      ext3_alloc_blocks: multiple allocate blocks needed for a branch
+ *      ext3_alloc_blocks - multiple allocate blocks needed for a branch
+ *      @handle: handle for this transaction
+ *      @inode: owner
+ *      @goal: preferred place for allocation
 *      @indirect_blks: the number of blocks need to allocate for indirect
 *                      blocks
- *
+ *      @blks:  number of blocks that need to be allocated for direct blocks
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks(if needed) and the first direct block,
- *      @blks:  on return it will store the total number of allocated
+ *      @err: here we store the error value
- *              direct blocks
+ *
+ *      return the number of direct blocks allocated
 */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
                        ext3_fsblk_t goal, int indirect_blks, int blks,
@@ -598,9 +602,11 @@ failed_out:
 /**
 *      ext3_alloc_branch - allocate and set up a chain of blocks.
+ *      @handle: handle for this transaction
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
+ *      @goal: preferred place for allocation
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
@@ -700,10 +706,9 @@ failed:
 /**
 * ext3_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *      ext3_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
@@ -1696,8 +1701,8 @@ static int ext3_journalled_writepage(struct page *page,
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
-                                        ext3_get_block);
+                                          ext3_get_block);
                if (ret != 0) {
                        ext3_journal_stop(handle);
                        goto out_unlock;
@@ -2530,7 +2535,6 @@ void ext3_truncate(struct inode *inode)
                         */
                } else {
                        /* Shared branch grows from an indirect block */
-                        BUFFER_TRACE(partial->bh, "get_write_access");
                        ext3_free_branches(handle, inode, partial->bh,
                                        partial->p,
                                        partial->p+1, (chain+n-1) - partial);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb70d65..bce9dce639b8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@ retry:
        inode->i_ctime = CURRENT_TIME_SEC;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext3_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0ccd7b12b73c..e746d30b1232 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -977,7 +977,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        o_blocks_count = le32_to_cpu(es->s_blocks_count);
        if (test_opt(sb, DEBUG))
-                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
+                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
+                       " upto "E3FSBLK" blocks\n",
                       o_blocks_count, n_blocks_count);
        if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -985,7 +986,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
                printk(KERN_ERR "EXT3-fs: filesystem on %s:"
-                        " too large to resize to %lu blocks safely\n",
+                        " too large to resize to "E3FSBLK" blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
                        ext3_warning(sb, __func__,
@@ -1065,11 +1066,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
-        ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
+        ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
-                   o_blocks_count + add);
+                   o_blocks_count, o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-        ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
+        ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
-                   o_blocks_count + add);
+                   o_blocks_count, o_blocks_count + add);
        if ((err = ext3_journal_stop(handle)))
                goto exit_put;
        if (test_opt(sb, DEBUG))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a367dd044280..2fedaf8b5012 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -411,9 +411,6 @@ static void ext3_put_super (struct super_block * sb)
        int i, err;
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-        lock_kernel();
        ext3_xattr_put_super(sb);
        err = journal_destroy(sbi->s_journal);
        sbi->s_journal = NULL;
@@ -462,8 +459,6 @@ static void ext3_put_super (struct super_block * sb)
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        unlock_kernel();
 }
 static struct kmem_cache *ext3_inode_cachep;
@@ -1306,9 +1301,9 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                ext3_msg(sb, KERN_WARNING,
                        "warning: mounting fs with errors, "
                        "running e2fsck is recommended");
-        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
-                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+                        le16_to_cpu(es->s_max_mnt_count))
                ext3_msg(sb, KERN_WARNING,
                        "warning: maximal mount count reached, "
                        "running e2fsck is recommended");
@@ -1325,7 +1320,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
                   valid forever! :) */
        es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
 #endif
-        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+        if (!le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        es->s_mtime = cpu_to_le32(get_seconds());
@@ -1627,8 +1622,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_resgid = EXT3_DEF_RESGID;
        sbi->s_sb_block = sb_block;
-        unlock_kernel();
        blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
        if (!blocksize) {
                ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
@@ -1654,7 +1647,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
         * Note: s_es must be initialized as soon as possible because
         *       some ext3 macro-instructions depend on its value
         */
-        es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+        es = (struct ext3_super_block *) (bh->b_data + offset);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT3_SUPER_MAGIC)
@@ -1765,7 +1758,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                               "error: can't read superblock on 2nd try");
                        goto failed_mount;
                }
-                es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+                es = (struct ext3_super_block *)(bh->b_data + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
                        ext3_msg(sb, KERN_ERR,
@@ -1864,13 +1857,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
                               le32_to_cpu(es->s_first_data_block) - 1)
                                       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
-        db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
+        db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
-                   EXT3_DESC_PER_BLOCK(sb);
        sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
                                    GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
                ext3_msg(sb, KERN_ERR,
                        "error: not enough memory");
+                ret = -ENOMEM;
                goto failed_mount;
        }
@@ -1958,6 +1951,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        }
        if (err) {
                ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+                ret = err;
                goto failed_mount3;
        }
@@ -2025,7 +2019,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
                "writeback");
-        lock_kernel();
        return 0;
 cantfind_ext3:
@@ -2055,7 +2048,6 @@ out_fail:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        lock_kernel();
        return ret;
 }
@@ -2168,7 +2160,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
                goto out_bdev;
        }
-        es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+        es = (struct ext3_super_block *) (bh->b_data + offset);
        if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -2361,6 +2353,21 @@ static int ext3_commit_super(struct super_block *sb,
        if (!sbh)
                return error;
+        if (buffer_write_io_error(sbh)) {
+                /*
+                 * Oh, dear.  A previous attempt to write the
+                 * superblock failed.  This could happen because the
+                 * USB device was yanked out.  Or it could happen to
+                 * be a transient write error and maybe the block will
+                 * be remapped.  Nothing we can do but to retry the
+                 * write and hope for the best.
+                 */
+                ext3_msg(sb, KERN_ERR, "previous I/O error to "
+                       "superblock detected");
+                clear_buffer_write_io_error(sbh);
+                set_buffer_uptodate(sbh);
+        }
        /*
         * If the file system is mounted read-only, don't update the
         * superblock write time.  This avoids updating the superblock
@@ -2377,8 +2384,15 @@ static int ext3_commit_super(struct super_block *sb,
        es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
-        if (sync)
+        if (sync) {
                error = sync_dirty_buffer(sbh);
+                if (buffer_write_io_error(sbh)) {
+                        ext3_msg(sb, KERN_ERR, "I/O error while writing "
+                               "superblock");
+                        clear_buffer_write_io_error(sbh);
+                        set_buffer_uptodate(sbh);
+                }
+        }
        return error;
 }
@@ -2538,8 +2552,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        int i;
 #endif
-        lock_kernel();
        /* Store the original options */
        lock_super(sb);
        old_sb_flags = sb->s_flags;
@@ -2648,7 +2660,6 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
                        kfree(old_opts.s_qf_names[i]);
 #endif
        unlock_super(sb);
-        unlock_kernel();
        if (enable_quota)
                dquot_resume(sb, -1);
@@ -2669,7 +2680,6 @@ restore_opts:
        }
 #endif
        unlock_super(sb);
-        unlock_kernel();
        return err;
 }
@@ -3010,16 +3020,16 @@ out:
 #endif
-static int ext3_get_sb(struct file_system_type *fs_type,
+static struct dentry *ext3_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
 }
 static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext3",
-        .get_sb         = ext3_get_sb,
+        .mount          = ext3_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8867b2a1e5fe..c947e36eda6c 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd30799a43ed..14c3af26c671 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * less than the blocksize * 8 ( which is the size
                 * of bitmap ), set rest of the block bitmap to 1
                 */
-                mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+                ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+                                     bh->b_data);
        }
        return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -489,7 +490,7 @@ error_return:
 * Check if filesystem has nblocks free & available for allocation.
 * On success return 1, return 0 on failure.
 */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
        s64 free_blocks, dirty_blocks, root_blocks;
        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3db5084db9bd..fac90f3fba80 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 static struct kmem_cache *ext4_system_zone_cachep;
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
-                                             SLAB_RECLAIM_ACCOUNT);
        if (ext4_system_zone_cachep == NULL)
                return -ENOMEM;
        return 0;
 }
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
        kmem_cache_destroy(ext4_system_zone_cachep);
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 374510f72baa..ece76fb6a40c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
                                struct file *filp);
 const struct file_operations ext4_dir_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = ext4_llseek,
        .read           = generic_read_dir,
        .readdir        = ext4_readdir,         /* we take BKL. needed?*/
        .unlocked_ioctl = ext4_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 889ec9d5e6ad..6a5edea2d70b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -168,7 +168,20 @@ struct mpage_da_data {
        int pages_written;
        int retval;
 };
-#define EXT4_IO_UNWRITTEN       0x1
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define EXT4_IO_END_UNWRITTEN   0x0001
+#define EXT4_IO_END_ERROR       0x0002
+struct ext4_io_page {
+        struct page     *p_page;
+        atomic_t        p_count;
+};
+#define MAX_IO_PAGES 128
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished IO list */
        struct inode            *inode;         /* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
        struct work_struct      work;           /* data work queue */
        struct kiocb            *iocb;          /* iocb struct for AIO */
        int                     result;         /* error value for AIO */
+        int                     num_io_pages;
+        struct ext4_io_page     *pages[MAX_IO_PAGES];
 } ext4_io_end_t;
+struct ext4_io_submit {
+        int                     io_op;
+        struct bio              *io_bio;
+        ext4_io_end_t           *io_end;
+        struct ext4_io_page     *io_page;
+        sector_t                io_next_block;
+};
 /*
 * Special inodes numbers
 */
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE             1024
 #define EXT4_MAX_BLOCK_SIZE             65536
 #define EXT4_MIN_BLOCK_LOG_SIZE         10
+#define EXT4_MAX_BLOCK_LOG_SIZE         16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)             ((s)->s_blocksize)
 #else
@@ -834,6 +858,7 @@ struct ext4_inode_info {
        spinlock_t i_completed_io_lock;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
+        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        /*
         * Transactions that contain inode's metadata needed to complete
@@ -889,6 +914,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE     0x80000000 /* Initialize uninitialized itables */
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
@@ -1087,7 +1113,6 @@ struct ext4_sb_info {
        struct completion s_kobj_unregister;
        /* Journaling */
-        struct inode *s_journal_inode;
        struct journal_s *s_journal;
        struct list_head s_orphan;
        struct mutex s_orphan_lock;
@@ -1120,10 +1145,7 @@ struct ext4_sb_info {
        /* for buddy allocator */
        struct ext4_group_info ***s_group_info;
        struct inode *s_buddy_cache;
-        long s_blocks_reserved;
-        spinlock_t s_reserve_lock;
        spinlock_t s_md_lock;
-        tid_t s_last_transaction;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
@@ -1141,7 +1163,6 @@ struct ext4_sb_info {
        unsigned long s_mb_last_start;
        /* stats for buddy allocator */
-        spinlock_t s_mb_pa_lock;
        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
        atomic_t s_bal_success; /* we found long enough chunks */
        atomic_t s_bal_allocated;       /* in blocks */
@@ -1172,6 +1193,11 @@ struct ext4_sb_info {
        /* timer for periodic error stats printing */
        struct timer_list s_err_report;
+        /* Lazy inode table initialization info */
+        struct ext4_li_request *s_li_request;
+        /* Wait multiplier for lazy initialization thread */
+        unsigned int s_li_wait_mult;
 };
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1533,7 +1559,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                        ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT                   10
+#define EXT4_DEF_LI_MAX_START_DELAY             5
+#define EXT4_LAZYINIT_QUIT                      0x0001
+#define EXT4_LAZYINIT_RUNNING                   0x0002
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+        unsigned long           li_state;
+        wait_queue_head_t       li_wait_daemon;
+        wait_queue_head_t       li_wait_task;
+        struct timer_list       li_timer;
+        struct task_struct      *li_task;
+        struct list_head        li_request_list;
+        struct mutex            li_list_mtx;
+};
+struct ext4_li_request {
+        struct super_block      *lr_super;
+        struct ext4_sb_info     *lr_sbi;
+        ext4_group_t            lr_next_group;
+        struct list_head        lr_request;
+        unsigned long           lr_next_sched;
+        unsigned long           lr_timeout;
+};
+struct ext4_features {
+        struct kobject f_kobj;
+        struct completion f_kobj_unregister;
+};
 /*
 * Function prototypes
@@ -1561,7 +1622,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1605,11 +1665,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-                                       struct buffer_head *bh,
+extern int ext4_init_inode_table(struct super_block *sb,
-                                       ext4_group_t group,
+                                 ext4_group_t group, int barrier);
-                                       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1678,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-                                                ext4_group_t, int);
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
@@ -1657,13 +1714,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
 /* ioctl.c */
@@ -1960,6 +2015,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2029,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
                                 ext4_fsblk_t start_blk,
                                 unsigned int count);
@@ -2002,6 +2058,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+                               struct page *page,
+                               int len,
+                               struct writeback_control *wbc);
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bdb6ce7e2eb4..28ce70fd9cd0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
        ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+        ext4_fsblk_t block;
+        block = le32_to_cpu(ex->ee_start_lo);
+        block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+        return block;
+}
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+        ext4_fsblk_t block;
+        block = le32_to_cpu(ix->ei_leaf_lo);
+        block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+        return block;
+}
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+                                         ext4_fsblk_t pb)
+{
+        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                      0xffff);
+}
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+                                         ext4_fsblk_t pb)
+{
+        ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                     0xffff);
+}
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
                                         sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-                                 struct ext4_ext_path *path,
-                                 struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-                                                        ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
                                                        struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-                                                ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-                                                ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3e5717..0554c48cb1fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,55 +44,6 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-        ext4_fsblk_t block;
-        block = le32_to_cpu(ex->ee_start_lo);
-        block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-        return block;
-}
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-        ext4_fsblk_t block;
-        block = le32_to_cpu(ix->ei_leaf_lo);
-        block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-        return block;
-}
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-        ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-        ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-        ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-        ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                /* try to predict block placement */
                ex = path[depth].p_ext;
                if (ex)
-                        return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+                        return (ext4_ext_pblock(ex) +
+                                (block - le32_to_cpu(ex->ee_block)));
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-        ext4_fsblk_t block = ext_pblock(ext);
+        ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
 {
-        ext4_fsblk_t block = idx_pblock(ext_idx);
+        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-                            idx_pblock(path->p_idx));
+                            ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_uninitialized(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
-                                  ext_pblock(path->p_ext));
+                                  ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug("  []");
        }
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_uninitialized(ex),
-                          ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
 }
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
        path->p_idx = l - 1;
        ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-                  idx_pblock(path->p_idx));
+                  ext4_idx_pblock(path->p_idx));
 #ifdef CHECK_BINSEARCH
        {
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
        path->p_ext = l - 1;
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
-                        ext_pblock(path->p_ext),
+                        ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_uninitialized(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
                ext4_ext_binsearch_idx(inode, path + ppos, block);
-                path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
-                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
        ext4_ext_show_path(inode, path);
@@ -739,9 +691,9 @@ err:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-                                struct ext4_ext_path *curp,
+                                 struct ext4_ext_path *curp,
-                                int logical, ext4_fsblk_t ptr)
+                                 int logical, ext4_fsblk_t ptr)
 {
        struct ext4_extent_idx *ix;
        int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                        EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(path[depth].p_ext->ee_block),
-                                ext_pblock(path[depth].p_ext),
+                                ext4_ext_pblock(path[depth].p_ext),
                                ext4_ext_is_uninitialized(path[depth].p_ext),
                                ext4_ext_get_actual_len(path[depth].p_ext),
                                newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                        le32_to_cpu(path[i].p_idx->ei_block),
-                                        idx_pblock(path[i].p_idx),
+                                        ext4_idx_pblock(path[i].p_idx),
                                        newblock);
                        /*memmove(++fidx, path[i].p_idx++,
                                        sizeof(struct ext4_extent_idx));
@@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-                  idx_pblock(EXT_FIRST_INDEX(neh)));
+                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
        neh->eh_depth = cpu_to_le16(path->p_depth + 1);
        err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1184,9 @@ out:
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
-int
+static int ext4_ext_search_left(struct inode *inode,
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+                                struct ext4_ext_path *path,
-                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
@@ -1286,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
        }
        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-        *phys = ext_pblock(ex) + ee_len - 1;
+        *phys = ext4_ext_pblock(ex) + ee_len - 1;
        return 0;
 }
@@ -1297,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
-int
+static int ext4_ext_search_right(struct inode *inode,
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+                                 struct ext4_ext_path *path,
-                        ext4_lblk_t *logical, ext4_fsblk_t *phys)
+                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
@@ -1342,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                        }
                }
                *logical = le32_to_cpu(ex->ee_block);
-                *phys = ext_pblock(ex);
+                *phys = ext4_ext_pblock(ex);
                return 0;
        }
@@ -1357,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                /* next allocated block in this leaf */
                ex++;
                *logical = le32_to_cpu(ex->ee_block);
-                *phys = ext_pblock(ex);
+                *phys = ext4_ext_pblock(ex);
                return 0;
        }
@@ -1376,7 +1328,7 @@ got_index:
         * follow it and find the closest allocated
         * block to the right */
        ix++;
-        block = idx_pblock(ix);
+        block = ext4_idx_pblock(ix);
        while (++depth < path->p_depth) {
                bh = sb_bread(inode->i_sb, block);
                if (bh == NULL)
@@ -1388,7 +1340,7 @@ got_index:
                        return -EIO;
                }
                ix = EXT_FIRST_INDEX(eh);
-                block = idx_pblock(ix);
+                block = ext4_idx_pblock(ix);
                put_bh(bh);
        }
@@ -1402,7 +1354,7 @@ got_index:
        }
        ex = EXT_FIRST_EXTENT(eh);
        *logical = le32_to_cpu(ex->ee_block);
-        *phys = ext_pblock(ex);
+        *phys = ext4_ext_pblock(ex);
        put_bh(bh);
        return 0;
 }
@@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                return 0;
 #endif
-        if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+        if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                return 1;
        return 0;
 }
@@ -1585,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
-int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge(struct inode *inode,
-                          struct ext4_ext_path *path,
+                                 struct ext4_ext_path *path,
-                          struct ext4_extent *ex)
+                                 struct ext4_extent *ex)
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
@@ -1632,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
-                                    struct ext4_extent *newext,
+                                           struct ext4_extent *newext,
-                                    struct ext4_ext_path *path)
+                                           struct ext4_ext_path *path)
 {
        ext4_lblk_t b1, b2;
        unsigned int depth, len1;
@@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-                                ext4_ext_is_uninitialized(newext),
+                          ext4_ext_is_uninitialized(newext),
-                                ext4_ext_get_actual_len(newext),
+                          ext4_ext_get_actual_len(newext),
-                                le32_to_cpu(ex->ee_block),
+                          le32_to_cpu(ex->ee_block),
-                                ext4_ext_is_uninitialized(ex),
+                          ext4_ext_is_uninitialized(ex),
-                                ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                          ext4_ext_get_actual_len(ex),
+                          ext4_ext_pblock(ex));
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        return err;
@@ -1780,7 +1733,7 @@ has_space:
                /* there is no extent in this leaf, create first one */
                ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
-                                ext_pblock(newext),
+                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
                path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1747,7 @@ has_space:
                        ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
                                        "move %d from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
-                                        ext_pblock(newext),
+                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1761,7 @@ has_space:
                ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
                                "move %d from 0x%p to 0x%p\n",
                                le32_to_cpu(newext->ee_block),
-                                ext_pblock(newext),
+                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext),
                                nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1772,7 @@ has_space:
        le16_add_cpu(&eh->eh_entries, 1);
        nearex = path[depth].p_ext;
        nearex->ee_block = newext->ee_block;
-        ext4_ext_store_pblock(nearex, ext_pblock(newext));
+        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
 merge:
@@ -1845,9 +1798,9 @@ cleanup:
        return err;
 }
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                        ext4_lblk_t num, ext_prepare_callback func,
+                               ext4_lblk_t num, ext_prepare_callback func,
-                        void *cbdata)
+                               void *cbdata)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_ext_cache cbex;
@@ -1923,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
-                        cbex.ec_start = ext_pblock(ex);
+                        cbex.ec_start = ext4_ext_pblock(ex);
                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
@@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        /* free index block */
        path--;
-        leaf = idx_pblock(path->p_idx);
+        leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EIO;
@@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t start;
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
-                start = ext_pblock(ex) + ee_len - num;
+                start = ext4_ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
                ext4_free_blocks(handle, inode, 0, start, num, flags);
        } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        goto out;
                ext_debug("new extent: %u:%u:%llu\n", block, num,
-                                ext_pblock(ex));
+                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2374,9 @@ again:
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug("move to level %d (block %llu)\n",
-                                  i + 1, idx_pblock(path[i].p_idx));
+                                  i + 1, ext4_idx_pblock(path[i].p_idx));
                        memset(path + i + 1, 0, sizeof(*path));
-                        bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+                        bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
                        if (!bh) {
                                /* should we reset i_size? */
                                err = -EIO;
@@ -2535,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
-static void bi_complete(struct bio *bio, int error)
-{
-        complete((struct completion *)bio->bi_private);
-}
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
+        ext4_fsblk_t ee_pblock;
+        unsigned int ee_len;
        int ret;
-        struct bio *bio;
-        int blkbits, blocksize;
-        sector_t ee_pblock;
-        struct completion event;
-        unsigned int ee_len, len, done, offset;
-        blkbits   = inode->i_blkbits;
-        blocksize = inode->i_sb->s_blocksize;
        ee_len    = ext4_ext_get_actual_len(ex);
-        ee_pblock = ext_pblock(ex);
+        ee_pblock = ext4_ext_pblock(ex);
-        /* convert ee_pblock to 512 byte sectors */
-        ee_pblock = ee_pblock << (blkbits - 9);
-        while (ee_len > 0) {
-                if (ee_len > BIO_MAX_PAGES)
-                        len = BIO_MAX_PAGES;
-                else
-                        len = ee_len;
-                bio = bio_alloc(GFP_NOIO, len);
-                if (!bio)
-                        return -ENOMEM;
-                bio->bi_sector = ee_pblock;
-                bio->bi_bdev   = inode->i_sb->s_bdev;
-                done = 0;
-                offset = 0;
-                while (done < len) {
-                        ret = bio_add_page(bio, ZERO_PAGE(0),
-                                                        blocksize, offset);
-                        if (ret != blocksize) {
-                                /*
-                                 * We can't add any more pages because of
-                                 * hardware limitations.  Start a new bio.
-                                 */
-                                break;
-                        }
-                        done++;
-                        offset += blocksize;
-                        if (offset >= PAGE_CACHE_SIZE)
-                                offset = 0;
-                }
-                init_completion(&event);
+        ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
-                bio->bi_private = &event;
+        if (ret > 0)
-                bio->bi_end_io = bi_complete;
+                ret = 0;
-                submit_bio(WRITE, bio);
-                wait_for_completion(&event);
-                if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+        return ret;
-                        bio_put(bio);
-                        return -EIO;
-                }
-                bio_put(bio);
-                ee_len    -= done;
-                ee_pblock += done  << (blkbits - 9);
-        }
-        return 0;
 }
 #define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = map->m_lblk - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+        ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
        /*
         * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zeroed the full extent */
                return allocated;
@@ -2710,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = cpu_to_le16(ee_len - allocated);
                        ext4_ext_mark_uninitialized(ex);
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        ex3 = &newex;
@@ -2725,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                        goto fix_extent_len;
                                ex->ee_block = orig_ex.ee_block;
                                ex->ee_len   = orig_ex.ee_len;
-                                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                                ext4_ext_store_pblock(ex,
+                                        ext4_ext_pblock(&orig_ex));
                                ext4_ext_dirty(handle, inode, path + depth);
                                /* blocks available from map->m_lblk */
                                return allocated;
@@ -2782,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -2833,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zero out the first half */
                        /* blocks available from map->m_lblk */
@@ -2902,7 +2800,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -2915,7 +2813,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -2973,12 +2871,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = map->m_lblk - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+        ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
        /*
         * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -3099,7 +2997,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -3112,7 +3010,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+        ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -3180,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                unmap_underlying_metadata(bdev, block + i);
 }
+/*
+ * Handle the EOFBLOCKS_FL inode flag, clearing it when the request described
+ * by @map and @len reaches or passes the end of the last extent of the file.
+ *
+ * @path is indexed by tree level; path[ext_depth(inode)] is the leaf entry
+ * whose header is inspected here.
+ *
+ * Returns 0 when the flag is not set or does not have to be cleared, -EIO
+ * when the flag is set but the leaf holds no extents, and otherwise the
+ * result of ext4_mark_inode_dirty() after the flag has been cleared.
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+                              struct ext4_map_blocks *map,
+                              struct ext4_ext_path *path,
+                              unsigned int len)
+{
+        int i, depth;
+        struct ext4_extent_header *eh;
+        struct ext4_extent *last_ex;
+        if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+                return 0;
+        depth = ext_depth(inode);
+        eh = path[depth].p_hdr;
+        if (unlikely(!eh->eh_entries)) {
+                EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+                                 "EOFBLOCKS_FL set");
+                return -EIO;
+        }
+        last_ex = EXT_LAST_EXTENT(eh);
+        /*
+         * We should clear the EOFBLOCKS_FL flag if we are writing the
+         * last block in the last extent in the file.  We test this by
+         * first checking whether the requested range, which ends at
+         * map->m_lblk + len, reaches the end of (or goes beyond) the
+         * last extent in this leaf.  If it does not, we can bail out
+         * from this function immediately.
+         */
+        if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+            ext4_ext_get_actual_len(last_ex))
+                return 0;
+        /*
+         * If the caller does appear to be planning to write at or
+         * beyond the end of the current extent, we then test to see
+         * if the current extent is the last extent in the file, by
+         * checking to make sure it was reached via the rightmost node
+         * at each level of the tree.
+         */
+        for (i = depth-1; i >= 0; i--)
+                if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                        return 0;
+        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+        return ext4_mark_inode_dirty(handle, inode);
+}
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3206,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * completed
                 */
                if (io)
-                        io->flag = EXT4_IO_UNWRITTEN;
+                        io->flag = EXT4_IO_END_UNWRITTEN;
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
-                if (ret >= 0)
+                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
+                        err = check_eofblocks_fl(handle, inode, map, path,
+                                                 map->m_len);
+                } else
+                        err = ret;
                goto out2;
        }
        /* buffered IO case */
@@ -3244,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-        if (ret >= 0)
+        if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
+                err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+                if (err < 0)
+                        goto out2;
+        }
 out:
        if (ret <= 0) {
                err = ret;
@@ -3292,6 +3250,7 @@ out2:
        }
        return err ? err : allocated;
 }
 /*
 * Block allocation/map/preallocation routine for extents based files
 *
@@ -3315,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
-        struct ext4_extent newex, *ex, *last_ex;
+        struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-        int i, err = 0, depth, ret, cache_type;
+        int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3341,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
-                                   + ext_pblock(&newex);
+                                   + ext4_ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3379,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-                ext4_fsblk_t ee_start = ext_pblock(ex);
+                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
                /*
@@ -3488,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io)
-                                io->flag = EXT4_IO_UNWRITTEN;
+                                io->flag = EXT4_IO_END_UNWRITTEN;
                        else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
-        if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+        err = check_eofblocks_fl(handle, inode, map, path, ar.len);
-                if (unlikely(!eh->eh_entries)) {
+        if (err)
-                        EXT4_ERROR_INODE(inode,
+                goto out2;
-                                         "eh->eh_entries == 0 and "
-                                         "EOFBLOCKS_FL set");
-                        err = -EIO;
-                        goto out2;
-                }
-                last_ex = EXT_LAST_EXTENT(eh);
-                /*
-                 * If the current leaf block was reached by looking at
-                 * the last index block all the way down the tree, and
-                 * we are extending the inode beyond the last extent
-                 * in the current leaf block, then clear the
-                 * EOFBLOCKS_FL flag.
-                 */
-                for (i = depth-1; i >= 0; i--) {
-                        if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-                                break;
-                }
-                if ((i < 0) &&
-                    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-                     ext4_ext_get_actual_len(last_ex)))
-                        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-        }
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
                /* free data blocks we just allocated */
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-                ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+                ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
        /* previous routine could use block we allocated */
-        newblock = ext_pblock(&newex);
+        newblock = ext4_ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
        if (allocated > map->m_len)
                allocated = map->m_len;
@@ -3729,7 +3667,7 @@ retry:
                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
-                                    inode->i_ino, block, max_blocks);
+                                    inode->i_ino, map.m_lblk, max_blocks);
 #endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ee92b66d4558..5a5c55ddceef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -130,8 +130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        return dquot_file_open(inode, filp);
 }
+/*
+ * ext4_llseek() mirrors generic_file_llseek(), except that the upper
+ * bound for the resulting offset depends on how the inode is mapped:
+ * extent-mapped inodes use the superblock's s_maxbytes, block-mapped
+ * inodes use s_bitmap_maxbytes.  Everything else is meant to behave
+ * exactly like generic_file_llseek().
+ *
+ * Returns the new file position, or -EINVAL if the computed offset is
+ * negative or larger than the applicable limit.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+        struct inode *inode = file->f_mapping->host;
+        loff_t limit, result;
+        limit = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
+                inode->i_sb->s_maxbytes :
+                EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+        mutex_lock(&inode->i_mutex);
+        if (origin == SEEK_END) {
+                offset += inode->i_size;
+        } else if (origin == SEEK_CUR) {
+                /* A zero-length relative seek only reports the position */
+                if (!offset) {
+                        mutex_unlock(&inode->i_mutex);
+                        return file->f_pos;
+                }
+                offset += file->f_pos;
+        }
+        if (offset >= 0 && offset <= limit) {
+                /* Only touch the file when the position really moves */
+                if (file->f_pos != offset) {
+                        file->f_pos = offset;
+                        file->f_version = 0;
+                }
+                result = offset;
+        } else {
+                result = -EINVAL;
+        }
+        mutex_unlock(&inode->i_mutex);
+        return result;
+}
 const struct file_operations ext4_file_operations = {
-        .llseek         = generic_file_llseek,
+        .llseek         = ext4_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..c1a7bc923cf6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,6 +34,89 @@
 #include <trace/events/ext4.h>
+/*
+ * Debug helper: print every entry on the inode's i_completed_io_list
+ * together with its list neighbours.  Does nothing unless EXT4_DEBUG is
+ * defined.
+ */
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef  EXT4_DEBUG
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        ext4_io_end_t *io, *prev_io, *next_io;
+        unsigned long flags;
+        if (list_empty(&ei->i_completed_io_list)) {
+                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+                return;
+        }
+        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        list_for_each_entry(io, &ei->i_completed_io_list, list) {
+                /* Neighbours are derived from the raw list links */
+                prev_io = container_of(io->list.prev, ext4_io_end_t, list);
+                next_io = container_of(io->list.next, ext4_io_end_t, list);
+                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                            io, inode->i_ino, prev_io, next_io);
+        }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+#endif
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on a workqueue but may not get scheduled
+ * immediately. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed IO that might
+ * need the conversion. This function walks through that list and
+ * converts the related unwritten extents for completed IO
+ * to written.
+ *
+ * Returns 0 if the list was empty or every entry was converted,
+ * otherwise the most recent negative error returned by
+ * ext4_end_io_nolock().
+ */
+static int flush_completed_IO(struct inode *inode)
+{
+        ext4_io_end_t *io;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long flags;
+        int ret = 0;
+        int ret2 = 0;
+        if (list_empty(&ei->i_completed_io_list))
+                return ret;
+        dump_completed_IO(inode);
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        while (!list_empty(&ei->i_completed_io_list)){
+                io = list_entry(ei->i_completed_io_list.next,
+                                ext4_io_end_t, list);
+                /*
+                 * Call ext4_end_io_nolock() to convert completed
+                 * IO to written.
+                 *
+                 * When ext4_sync_file() is called, run_queue() may already
+                 * be about to flush the work corresponding to this io
+                 * structure. It will be upset if it finds that the io
+                 * structure related to the work-to-be-scheduled is freed.
+                 *
+                 * Thus we need to keep the io structure valid here after
+                 * the conversion has finished. The io structure has a flag
+                 * to avoid double conversion from both fsync and the
+                 * background workqueue work.
+                 *
+                 * The spinlock is dropped around the conversion call and
+                 * retaken before the list is touched again.
+                 *
+                 * NOTE(review): on failure the entry is not unlinked here,
+                 * so it remains the head of the list for the next
+                 * iteration; presumably something else removes it, or the
+                 * loop would keep retrying the same entry -- confirm.
+                 */
+                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+                ret = ext4_end_io_nolock(io);
+                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+                if (ret < 0)
+                        ret2 = ret;
+                else
+                        list_del_init(&io->list);
+        }
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        return (ret2 < 0) ? ret2 : 0;
+}
 /*
 * If we're not journaling and this is a just-created file, we have to
 * sync our parent directory (if it was freshly created) since
@@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync)
                    (journal->j_fs_dev != journal->j_dev) &&
                    (journal->j_flags & JBD2_BARRIER))
                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-                                        NULL, BLKDEV_IFL_WAIT);
+                                        NULL);
                ret = jbd2_log_wait_commit(journal, commit_tid);
        } else if (journal->j_flags & JBD2_BARRIER)
-                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-                        BLKDEV_IFL_WAIT);
        return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..1ce240a23ebb 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -50,7 +50,7 @@
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 {
        int i;
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                ext4_group_t block_group,
+                                       struct buffer_head *bh,
-                                struct ext4_group_desc *gdp)
+                                       ext4_group_t block_group,
+                                       struct ext4_group_desc *gdp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        }
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
        return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return NULL;
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                return bh;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
-void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-                       int flex_size, struct orlov_stats *stats)
+                            int flex_size, struct orlov_stats *stats)
 {
        struct ext4_group_desc *desc;
        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
        int free = 0, retval = 0, count;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+        /*
+         * We have to be sure that new inode allocation does not race with
+         * inode table initialization, because otherwise we may end up
+         * allocating and writing new inode right before sb_issue_zeroout
+         * takes place and overwriting our new inode with zeroes. So we
+         * take alloc_sem to prevent it.
+         */
+        down_read(&grp->alloc_sem);
        ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_unlock_group(sb, group);
+                up_read(&grp->alloc_sem);
                ext4_error(sb, "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
        ext4_unlock_group(sb, group);
+        up_read(&grp->alloc_sem);
        return retval;
 }
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
        }
        return count;
 }
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ *
+ * @sb:      superblock of the filesystem
+ * @group:   block group whose inode table should be zeroed
+ * @barrier: if non-zero, issue a block device flush after the zeroout
+ *
+ * Returns 0 when the table is (or already was) marked zeroed or when the
+ * group descriptor cannot be obtained, 1 when the filesystem is read-only
+ * or the group's unused-inode accounting is inconsistent, and a negative
+ * error code when starting the journal handle, getting write access,
+ * zeroing the blocks or dirtying the descriptor fails.
+ */
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                          int barrier)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp = NULL;
+        struct buffer_head *group_desc_bh;
+        handle_t *handle;
+        ext4_fsblk_t blk;
+        int num, ret = 0, used_blks = 0;
+        /* This should not happen, but just to be sure check this */
+        if (sb->s_flags & MS_RDONLY) {
+                ret = 1;
+                goto out;
+        }
+        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+        if (!gdp)
+                goto out;
+        /*
+         * We do not need to lock this, because we are the only one
+         * handling this flag.
+         */
+        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+                goto out;
+        handle = ext4_journal_start_sb(sb, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+        /*
+         * From here on every exit has to go through err_out so that
+         * alloc_sem is released and the journal handle is stopped.
+         */
+        down_write(&grp->alloc_sem);
+        /*
+         * If inode bitmap was already initialized there may be some
+         * used inodes so we need to skip blocks with used inodes in
+         * inode table.
+         */
+        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+                used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                            ext4_itable_unused_count(sb, gdp)),
+                            sbi->s_inodes_per_block);
+        if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+                ext4_error(sb, "Something is wrong with group %u\n"
+                           "Used itable blocks: %d, "
+                           "itable unused count: %u\n",
+                           group, used_blks,
+                           ext4_itable_unused_count(sb, gdp));
+                ret = 1;
+                goto err_out;
+        }
+        blk = ext4_inode_table(sb, gdp) + used_blks;
+        num = sbi->s_itb_per_group - used_blks;
+        BUFFER_TRACE(group_desc_bh, "get_write_access");
+        ret = ext4_journal_get_write_access(handle,
+                                            group_desc_bh);
+        if (ret)
+                goto err_out;
+        /*
+         * Skip zeroout if the inode table is full. But we set the ZEROED
+         * flag anyway, because obviously, when it is full it does not need
+         * further zeroing.
+         */
+        if (unlikely(num == 0))
+                goto skip_zeroout;
+        ext4_debug("going to zero out inode table in group %d\n",
+                   group);
+        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+        if (ret < 0)
+                goto err_out;
+        if (barrier)
+                blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+skip_zeroout:
+        ext4_lock_group(sb, group);
+        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+        ext4_unlock_group(sb, group);
+        BUFFER_TRACE(group_desc_bh,
+                     "call ext4_handle_dirty_metadata");
+        ret = ext4_handle_dirty_metadata(handle, NULL,
+                                         group_desc_bh);
+err_out:
+        up_write(&grp->alloc_sem);
+        ext4_journal_stop(handle);
+out:
+        return ret;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..bdbe69902207 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
+        trace_ext4_begin_ordered_truncate(inode, new_size);
        return jbd2_journal_begin_ordered_truncate(
                                        EXT4_SB(inode->i_sb)->s_journal,
                                        &EXT4_I(inode)->jinode,
@@ -60,6 +61,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 /*
 * Test whether an inode is a fast symlink.
@@ -172,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
        handle_t *handle;
        int err;
+        trace_ext4_evict_inode(inode);
        if (inode->i_nlink) {
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
@@ -755,6 +763,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+                if (unlikely(!bh)) {
+                        err = -EIO;
+                        goto failed;
+                }
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
@@ -1207,8 +1220,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                                break;
                        idx++;
                        num++;
-                        if (num >= max_pages)
+                        if (num >= max_pages) {
+                                done = 1;
                                break;
+                        }
                }
                pagevec_release(&pvec);
        }
@@ -1538,10 +1553,10 @@ static int do_journal_get_write_access(handle_t *handle,
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
-         * __block_prepare_write() could have dirtied some buffers. Clean
+         * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
-         * by __block_prepare_write() isn't a real problem here as we clear
+         * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
@@ -1995,16 +2010,23 @@ static void ext4_da_page_release_reservation(struct page *page,
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                              struct ext4_map_blocks *map)
 {
-        long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+        loff_t size = i_size_read(inode);
+        unsigned int len, block_start;
+        struct buffer_head *bh, *page_bufs = NULL;
+        int journal_data = ext4_should_journal_data(inode);
+        sector_t pblock = 0, cur_logical = 0;
+        struct ext4_io_submit io_submit;
        BUG_ON(mpd->next_page <= mpd->first_page);
+        memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2042,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                        int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
                        index = page->index;
                        if (index > end)
                                break;
+                        if (index == size >> PAGE_CACHE_SHIFT)
+                                len = size & ~PAGE_CACHE_MASK;
+                        else
+                                len = PAGE_CACHE_SIZE;
+                        if (map) {
+                                cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                        inode->i_blkbits);
+                                pblock = map->m_pblk + (cur_logical -
+                                                        map->m_lblk);
+                        }
                        index++;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
-                        pages_skipped = mpd->wbc->pages_skipped;
-                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                                /*
-                                 * have successfully written the page
-                                 * without skipping the same
-                                 */
-                                mpd->pages_written++;
                        /*
-                         * In error case, we have to continue because
+                         * If the page does not have buffers (for
-                         * remaining pages are still locked
+                         * whatever reason), try to create them using
-                         * XXX: unlock and re-dirty them?
+                         * __block_write_begin.  If this fails,
+                         * redirty the page and move on.
                         */
-                        if (ret == 0)
+                        if (!page_has_buffers(page)) {
-                                ret = err;
+                                if (__block_write_begin(page, 0, len,
-                }
+                                                noalloc_get_block_write)) {
-                pagevec_release(&pvec);
+                                redirty_page:
-        }
+                                        redirty_page_for_writepage(mpd->wbc,
-        return ret;
+                                                                   page);
-}
+                                        unlock_page(page);
+                                        continue;
-/*
+                                }
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+                                commit_write = 1;
- *
+                        }
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                 struct ext4_map_blocks *map)
-{
-        struct inode *inode = mpd->inode;
-        struct address_space *mapping = inode->i_mapping;
-        int blocks = map->m_len;
-        sector_t pblock = map->m_pblk, cur_logical;
-        struct buffer_head *head, *bh;
-        pgoff_t index, end;
-        struct pagevec pvec;
-        int nr_pages, i;
-        index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        pagevec_init(&pvec, 0);
-        while (index <= end) {
-                /* XXX: optimize tail */
-                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-                if (nr_pages == 0)
-                        break;
-                for (i = 0; i < nr_pages; i++) {
-                        struct page *page = pvec.pages[i];
-                        index = page->index;
-                        if (index > end)
-                                break;
-                        index++;
-                        BUG_ON(!PageLocked(page));
-                        BUG_ON(PageWriteback(page));
-                        BUG_ON(!page_has_buffers(page));
-                        bh = page_buffers(page);
-                        head = bh;
-                        /* skip blocks out of the range */
-                        do {
-                                if (cur_logical >= map->m_lblk)
-                                        break;
-                                cur_logical++;
-                        } while ((bh = bh->b_this_page) != head);
+                        bh = page_bufs = page_buffers(page);
+                        block_start = 0;
                        do {
-                                if (cur_logical >= map->m_lblk + blocks)
+                                if (!bh)
-                                        break;
+                                        goto redirty_page;
+                                if (map && (cur_logical >= map->m_lblk) &&
-                                if (buffer_delay(bh) || buffer_unwritten(bh)) {
+                                    (cur_logical <= (map->m_lblk +
+                                                     (map->m_len - 1)))) {
-                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                        } else {
-                                                /*
-                                                 * unwritten already should have
-                                                 * blocknr assigned. Verify that
-                                                 */
-                                                clear_buffer_unwritten(bh);
-                                                BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                        if (buffer_unwritten(bh) ||
+                                            buffer_mapped(bh))
+                                                BUG_ON(bh->b_blocknr != pblock);
+                                        if (map->m_flags & EXT4_MAP_UNINIT)
+                                                set_buffer_uninit(bh);
+                                        clear_buffer_unwritten(bh);
+                                }
-                                } else if (buffer_mapped(bh))
+                                /* redirty page if block allocation was not done */
-                                        BUG_ON(bh->b_blocknr != pblock);
+                                if (buffer_delay(bh) || buffer_unwritten(bh))
+                                        redirty_page = 1;
-                                if (map->m_flags & EXT4_MAP_UNINIT)
+                                bh = bh->b_this_page;
-                                        set_buffer_uninit(bh);
+                                block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                        } while ((bh = bh->b_this_page) != head);
+                        } while (bh != page_bufs);
+                        if (redirty_page)
+                                goto redirty_page;
+                        if (commit_write)
+                                /* mark the buffer_heads as dirty & uptodate */
+                                block_commit_write(page, 0, len);
+                        /*
+                         * Delalloc doesn't support data journalling,
+                         * but eventually maybe we'll lift this
+                         * restriction.
+                         */
+                        if (unlikely(journal_data && PageChecked(page)))
+                                err = __ext4_journalled_writepage(page, len);
+                        else
+                                err = ext4_bio_write_page(&io_submit, page,
+                                                          len, mpd->wbc);
+                        if (!err)
+                                mpd->pages_written++;
+                        /*
+                         * In error case, we have to continue because
+                         * remaining pages are still locked
+                         */
+                        if (ret == 0)
+                                ret = err;
                }
                pagevec_release(&pvec);
        }
+        ext4_io_submit(&io_submit);
+        return ret;
 }
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
 {
@@ -2187,35 +2195,32 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
 *
 * @mpd - bh describing space
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-        struct ext4_map_blocks map;
+        struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
        /*
-         * We consider only non-mapped and non-allocated blocks
+         * If the blocks are mapped already, or we couldn't accumulate
+         * any blocks, then proceed immediately to the submission stage.
         */
-        if ((mpd->b_state  & (1 << BH_Mapped)) &&
+        if ((mpd->b_size == 0) ||
-                !(mpd->b_state & (1 << BH_Delay)) &&
+            ((mpd->b_state  & (1 << BH_Mapped)) &&
-                !(mpd->b_state & (1 << BH_Unwritten)))
+             !(mpd->b_state & (1 << BH_Delay)) &&
-                return 0;
+             !(mpd->b_state & (1 << BH_Unwritten))))
+                goto submit_io;
-        /*
-         * If we didn't accumulate anything to write simply return
-         */
-        if (!mpd->b_size)
-                return 0;
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
@@ -2252,17 +2257,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                err = blks;
                /*
-                 * If get block returns with error we simply
+                 * If get block returns EAGAIN or ENOSPC and there
-                 * return. Later writepage will redirty the page and
+                 * appear to be free blocks, we will call
-                 * writepages will find the dirty page again
+                 * ext4_writepage() for all of the pages which will
+                 * just redirty the pages.
                 */
                if (err == -EAGAIN)
-                        return 0;
+                        goto submit_io;
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                        return 0;
+                        goto submit_io;
                }
                /*
@@ -2287,10 +2293,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-                return err;
+                return;
        }
        BUG_ON(blks == 0);
+        mapp = &map;
        if (map.m_flags & EXT4_MAP_NEW) {
                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                int i;
@@ -2299,18 +2306,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
        }
-        /*
-         * If blocks are delayed marked, we need to
-         * put actual blocknr and drop delayed bit
-         */
-        if ((mpd->b_state & (1 << BH_Delay)) ||
-            (mpd->b_state & (1 << BH_Unwritten)))
-                mpage_put_bnr_to_bhs(mpd, &map);
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                        return err;
+                        /* This only happens if the journal is aborted */
+                        return;
        }
        /*
@@ -2321,10 +2321,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-                return ext4_mark_inode_dirty(handle, mpd->inode);
+                err = ext4_mark_inode_dirty(handle, mpd->inode);
+                if (err)
+                        ext4_error(mpd->inode->i_sb,
+                                   "Failed to mark inode %lu dirty",
+                                   mpd->inode->i_ino);
        }
-        return 0;
+submit_io:
+        mpage_da_submit_io(mpd, mapp);
+        mpd->io_done = 1;
 }
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2407,7 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-        if (mpage_da_map_blocks(mpd) == 0)
+        mpage_da_map_and_submit(mpd);
-                mpage_da_submit_io(mpd);
-        mpd->io_done = 1;
        return;
 }
@@ -2422,9 +2426,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 * The function finds extents of pages and scan them for all blocks.
 */
 static int __mpage_da_writepage(struct page *page,
-                                struct writeback_control *wbc, void *data)
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd)
 {
-        struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
@@ -2435,15 +2439,13 @@ static int __mpage_da_writepage(struct page *page,
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                 * and start IO on them using writepage()
+                 * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                        if (mpage_da_map_blocks(mpd) == 0)
+                        mpage_da_map_and_submit(mpd);
-                                mpage_da_submit_io(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                        mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@ -2550,8 +2552,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                if (buffer_delay(bh))
                        return 0; /* Not sure this could or should happen */
                /*
-                 * XXX: __block_prepare_write() unmaps passed block,
+                 * XXX: __block_write_begin() unmaps passed block, is it OK?
-                 * is it OK?
                 */
                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
@@ -2583,7 +2584,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
 * This function is used as a standard get_block_t calback function
 * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
 * These functions should only try to map a single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2624,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int ret = 0;
        int err;
+        ClearPageChecked(page);
        page_bufs = page_buffers(page);
        BUG_ON(!page_bufs);
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2700,7 +2702,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
 {
-        int ret = 0;
+        int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
@@ -2713,71 +2715,44 @@ static int ext4_writepage(struct page *page,
        else
                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
+        /*
-                page_bufs = page_buffers(page);
+         * If the page does not have buffers (for whatever reason),
-                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+         * try to create them using __block_write_begin.  If this
-                                        ext4_bh_delay_or_unwritten)) {
+         * fails, redirty the page and move on.
-                        /*
+         */
-                         * We don't want to do  block allocation
+        if (!page_has_buffers(page)) {
-                         * So redirty the page and return
+                if (__block_write_begin(page, 0, len,
-                         * We may reach here when we do a journal commit
+                                        noalloc_get_block_write)) {
-                         * via journal_submit_inode_data_buffers.
+                redirty_page:
-                         * If we don't have mapping block we just ignore
-                         * them. We can also reach here via shrink_page_list
-                         */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-        } else {
+                commit_write = 1;
+        }
+        page_bufs = page_buffers(page);
+        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                              ext4_bh_delay_or_unwritten)) {
                /*
-                 * The test for page_has_buffers() is subtle:
+                 * We don't want to do block allocation, so redirty
-                 * We know the page is dirty but it lost buffers. That means
+                 * the page and return.  We may reach here when we do
-                 * that at some moment in time after write_begin()/write_end()
+                 * a journal commit via journal_submit_inode_data_buffers.
-                 * has been called all buffers have been clean and thus they
+                 * We can also reach here via shrink_page_list
-                 * must have been written at least once. So they are all
-                 * mapped and we can happily proceed with mapping them
-                 * and writing the page.
-                 *
-                 * Try to initialize the buffer_heads and check whether
-                 * all are mapped and non delay. We don't want to
-                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, len,
+                goto redirty_page;
-                                          noalloc_get_block_write);
+        }
-                if (!ret) {
+        if (commit_write)
-                        page_bufs = page_buffers(page);
-                        /* check whether all are mapped and non delay */
-                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                                ext4_bh_delay_or_unwritten)) {
-                                redirty_page_for_writepage(wbc, page);
-                                unlock_page(page);
-                                return 0;
-                        }
-                } else {
-                        /*
-                         * We can't do block allocation here
-                         * so just redity the page and unlock
-                         * and return
-                         */
-                        redirty_page_for_writepage(wbc, page);
-                        unlock_page(page);
-                        return 0;
-                }
                /* now mark the buffer_heads as dirty and uptodate */
                block_commit_write(page, 0, len);
-        }
-        if (PageChecked(page) && ext4_should_journal_data(inode)) {
+        if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
-                ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
-        }
-        if (page_bufs && buffer_uninit(page_bufs)) {
+        if (buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@ -2824,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 */
 static int write_cache_pages_da(struct address_space *mapping,
                                struct writeback_control *wbc,
-                                struct mpage_da_data *mpd)
+                                struct mpage_da_data *mpd,
+                                pgoff_t *done_index)
 {
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
-        int nr_pages;
+        unsigned nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        long nr_to_write = wbc->nr_to_write;
+        int tag;
        pagevec_init(&pvec, 0);
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag = PAGECACHE_TAG_TOWRITE;
+        else
+                tag = PAGECACHE_TAG_DIRTY;
+        *done_index = index;
        while (!done && (index <= end)) {
                int i;
-                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-                              PAGECACHE_TAG_DIRTY,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@ -2862,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping,
                                break;
                        }
+                        *done_index = page->index + 1;
                        lock_page(page);
                        /*
@@ -2947,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping,
        long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+        pgoff_t done_index = 0;
+        pgoff_t end;
        trace_ext4_da_writepages(inode, wbc);
@@ -2982,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-        } else
+                end = -1;
+        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        }
        /*
         * This works around two forms of stupidity.  The first is in
@@ -3002,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping,
         * sbi->max_writeback_mb_bump whichever is smaller.
         */
        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-        if (!range_cyclic && range_whole)
+        if (!range_cyclic && range_whole) {
-                desired_nr_to_write = wbc->nr_to_write * 8;
+                if (wbc->nr_to_write == LONG_MAX)
-        else
+                        desired_nr_to_write = wbc->nr_to_write;
+                else
+                        desired_nr_to_write = wbc->nr_to_write * 8;
+        } else
                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                           max_pages);
        if (desired_nr_to_write > max_pages)
@@ -3021,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping,
        pages_skipped = wbc->pages_skipped;
 retry:
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag_pages_for_writeback(mapping, index, end);
        while (!ret && wbc->nr_to_write > 0) {
                /*
@@ -3059,16 +3054,14 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-                ret = write_cache_pages_da(mapping, wbc, &mpd);
+                ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                        if (mpage_da_map_blocks(&mpd) == 0)
+                        mpage_da_map_and_submit(&mpd);
-                                mpage_da_submit_io(&mpd);
-                        mpd.io_done = 1;
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
                trace_ext4_da_write_pages(inode, &mpd);
@@ -3115,14 +3108,13 @@ retry:
                         __func__, wbc->nr_to_write, ret);
        /* Update index */
-        index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-                mapping->writeback_index = index;
+                mapping->writeback_index = done_index;
 out_writepages:
        wbc->nr_to_write -= nr_to_writebump;
@@ -3457,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-        BUG_ON(!io);
-        if (io->page)
-                put_page(io->page);
-        iput(io->inode);
-        kfree(io);
-}
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
        struct buffer_head *head, *bh;
@@ -3642,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef  EXT4_DEBUG
-        struct list_head *cur, *before, *after;
-        ext4_io_end_t *io, *io0, *io1;
-        unsigned long flags;
-        if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-                return;
-        }
-        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-                cur = &io->list;
-                before = cur->prev;
-                io0 = container_of(before, ext4_io_end_t, list);
-                after = cur->next;
-                io1 = container_of(after, ext4_io_end_t, list);
-                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                            io, inode->i_ino, io0, io1);
-        }
-        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-        struct inode *inode = io->inode;
-        loff_t offset = io->offset;
-        ssize_t size = io->size;
-        int ret = 0;
-        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                   "list->prev 0x%p\n",
-                   io, inode->i_ino, io->list.next, io->list.prev);
-        if (list_empty(&io->list))
-                return ret;
-        if (io->flag != EXT4_IO_UNWRITTEN)
-                return ret;
-        ret = ext4_convert_unwritten_extents(inode, offset, size);
-        if (ret < 0) {
-                printk(KERN_EMERG "%s: failed to convert unwritten"
-                        "extents to written extents, error is %d"
-                        " io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-                return ret;
-        }
-        if (io->iocb)
-                aio_complete(io->iocb, io->result, 0);
-        /* clear the DIO AIO unwritten flag */
-        io->flag = 0;
-        return ret;
-}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-        ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-        struct inode            *inode = io->inode;
-        struct ext4_inode_info  *ei = EXT4_I(inode);
-        unsigned long           flags;
-        int                     ret;
-        mutex_lock(&inode->i_mutex);
-        ret = ext4_end_io_nolock(io);
-        if (ret < 0) {
-                mutex_unlock(&inode->i_mutex);
-                return;
-        }
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        if (!list_empty(&io->list))
-                list_del_init(&io->list);
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        mutex_unlock(&inode->i_mutex);
-        ext4_free_io_end(io);
-}
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-        ext4_io_end_t *io;
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        unsigned long flags;
-        int ret = 0;
-        int ret2 = 0;
-        if (list_empty(&ei->i_completed_io_list))
-                return ret;
-        dump_completed_IO(inode);
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        while (!list_empty(&ei->i_completed_io_list)){
-                io = list_entry(ei->i_completed_io_list.next,
-                                ext4_io_end_t, list);
-                /*
-                 * Calling ext4_end_io_nolock() to convert completed
-                 * IO to written.
-                 *
-                 * When ext4_sync_file() is called, run_queue() may already
-                 * about to flush the work corresponding to this io structure.
-                 * It will be upset if it founds the io structure related
-                 * to the work-to-be schedule is freed.
-                 *
-                 * Thus we need to keep the io structure still valid here after
-                 * convertion finished. The io structure has a flag to
-                 * avoid double converting from both fsync and background work
-                 * queue work.
-                 */
-                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-                ret = ext4_end_io_nolock(io);
-                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-                if (ret < 0)
-                        ret2 = ret;
-                else
-                        list_del_init(&io->list);
-        }
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        return (ret2 < 0) ? ret2 : 0;
-}
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-        ext4_io_end_t *io = NULL;
-        io = kmalloc(sizeof(*io), flags);
-        if (io) {
-                igrab(inode);
-                io->inode = inode;
-                io->flag = 0;
-                io->offset = 0;
-                io->size = 0;
-                io->page = NULL;
-                io->iocb = NULL;
-                io->result = 0;
-                INIT_WORK(&io->work, ext4_end_io_work);
-                INIT_LIST_HEAD(&io->list);
-        }
-        return io;
-}
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private, int ret,
                            bool is_async)
@@ -3828,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
        /* if not aio dio with unwritten extents, just free io and return */
-        if (io_end->flag != EXT4_IO_UNWRITTEN){
+        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
                iocb->private = NULL;
 out:
@@ -3845,14 +3661,14 @@ out:
        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-        /* queue the work to convert unwritten extents to written */
-        queue_work(wq, &io_end->work);
        /* Add the io_end to per-inode completed aio dio list*/
        ei = EXT4_I(io_end->inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        list_add_tail(&io_end->list, &ei->i_completed_io_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
        iocb->private = NULL;
 }
@@ -3873,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
                goto out;
        }
-        io_end->flag = EXT4_IO_UNWRITTEN;
+        io_end->flag = EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;
        /* Add the io_end to per-inode completed io list*/
@@ -5464,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        error = inode_change_ok(inode, attr);
@@ -5519,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        error = PTR_ERR(handle);
                        goto err_out;
                }
+                if (ext4_handle_valid(handle)) {
-                error = ext4_orphan_add(handle, inode);
+                        error = ext4_orphan_add(handle, inode);
+                        orphan = 1;
+                }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
@@ -5538,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                                orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
@@ -5560,7 +5380,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
-        if (inode->i_nlink)
+        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
        if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5412,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
         * will return the blocks that include the delayed allocation
         * blocks for this file.
         */
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
        return 0;
@@ -5643,7 +5461,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
@@ -5831,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
        int err, ret;
        might_sleep();
+        trace_ext4_mark_inode_dirty(inode, _RET_IP_);
        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (ext4_handle_valid(handle) &&
            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -338,6 +338,14 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES       \
+        (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -939,6 +947,85 @@ out:
 }
 /*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * makes sure no other parallel operation on the buddy
+ * cache happens while holding the buddy cache
+ * lock
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        int groups_per_page;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        groups_per_page = blocks_per_page >> 1;
+        if (groups_per_page == 0)
+                groups_per_page = 1;
+        /* lock all groups the page covers */
+        for (i = 0; i < groups_per_page; i++) {
+                if ((first_group + i) >= ngroups)
+                        break;
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* take every group's write allocation
+                 * semaphore. This makes sure there is
+                 * no block allocation going on in any
+                 * of those groups
+                 */
+                down_write_nested(&grp->alloc_sem, i);
+        }
+        return i;
+}
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                         ext4_group_t group, int locked_group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        /* release locks on all the groups */
+        for (i = 0; i < locked_group; i++) {
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* release the write allocation
+                 * semaphore that was taken on each
+                 * of these groups by
+                 * ext4_mb_get_buddy_cache_lock()
+                 */
+                up_write(&grp->alloc_sem);
+        }
+}
+/*
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        return 0;
 }
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-        int i;
-        int block, pnum;
-        int blocks_per_page;
-        int groups_per_page;
-        ext4_group_t ngroups = ext4_get_groups_count(sb);
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-        groups_per_page = blocks_per_page >> 1;
-        if (groups_per_page == 0)
-                groups_per_page = 1;
-        /* read all groups the page covers into the cache */
-        for (i = 0; i < groups_per_page; i++) {
-                if ((first_group + i) >= ngroups)
-                        break;
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                down_write_nested(&grp->alloc_sem, i);
-        }
-        return i;
-}
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group, int locked_group)
-{
-        int i;
-        int block, pnum;
-        int blocks_per_page;
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-        /* release locks on all the groups */
-        for (i = 0; i < locked_group; i++) {
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                up_write(&grp->alloc_sem);
-        }
-}
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
        .release        = seq_release,
 };
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+        BUG_ON(!cachep);
+        return cachep;
+}
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
 {
-        int i, len;
+        int i;
        int metalen = 0;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
+        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        /*
         * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                        meta_group_info;
        }
-        /*
-         * calculate needed size. if change bb_counters size,
-         * don't forget about ext4_mb_generate_buddy()
-         */
-        len = offsetof(typeof(**meta_group_info),
-                       bb_counters[sb->s_blocksize_bits + 2]);
        meta_group_info =
                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-        meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+        meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (meta_group_info[i] == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
                goto exit_group_info;
        }
+        memset(meta_group_info[i], 0, kmem_cache_size(cachep));
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                &(meta_group_info[i]->bb_state));
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        int num_meta_group_infos_max;
        int array_size;
        struct ext4_group_desc *desc;
+        struct kmem_cache *cachep;
        /* This is the number of blocks used by GDT */
        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
                goto err_freesgi;
        }
+        sbi->s_buddy_cache->i_ino = get_next_ino();
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
        return 0;
 err_freebuddy:
+        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0)
-                kfree(ext4_get_group_info(sb, i));
+                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
        i = num_meta_group_infos;
        while (i-- > 0)
                kfree(sbi->s_group_info[i]);
@@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned offset;
        unsigned max;
        int ret;
+        int cache_index;
+        struct kmem_cache *cachep;
+        char *namep = NULL;
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
-                return -ENOMEM;
+                ret = -ENOMEM;
+                goto out;
        }
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-                kfree(sbi->s_mb_offsets);
+                ret = -ENOMEM;
-                return -ENOMEM;
+                goto out;
+        }
+        cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+        cachep = ext4_groupinfo_caches[cache_index];
+        if (!cachep) {
+                char name[32];
+                int len = offsetof(struct ext4_group_info,
+                                        bb_counters[sb->s_blocksize_bits + 2]);
+                sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+                namep = kstrdup(name, GFP_KERNEL);
+                if (!namep) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                /* Need to free the kmem_cache_name() when we
+                 * destroy the slab */
+                cachep = kmem_cache_create(namep, len, 0,
+                                             SLAB_RECLAIM_ACCOUNT, NULL);
+                if (!cachep) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ext4_groupinfo_caches[cache_index] = cachep;
        }
        /* order 0 is regular bitmap */
@@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0) {
-                kfree(sbi->s_mb_offsets);
+                goto out;
-                kfree(sbi->s_mb_maxs);
-                return ret;
        }
        spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
-                kfree(sbi->s_mb_offsets);
+                ret = -ENOMEM;
-                kfree(sbi->s_mb_maxs);
+                goto out;
-                return -ENOMEM;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
@@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-        return 0;
+out:
+        if (ret) {
+                kfree(sbi->s_mb_offsets);
+                kfree(sbi->s_mb_maxs);
+                kfree(namep);
+        }
+        return ret;
 }
 /* need to called with the ext4 group lock held */
@@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
                        ext4_lock_group(sb, i);
                        ext4_mb_cleanup_pa(grinfo);
                        ext4_unlock_group(sb, i);
-                        kfree(grinfo);
+                        kmem_cache_free(cachep, grinfo);
                }
                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,7 +2605,7 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
-static inline void ext4_issue_discard(struct super_block *sb,
+static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
        int ret;
@@ -2566,11 +2614,12 @@ static inline void ext4_issue_discard(struct super_block *sb,
        discard_block = block + ext4_group_first_block_no(sb, block_group);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
-        ret = sb_issue_discard(sb, discard_block, count);
+        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-        if (ret == EOPNOTSUPP) {
+        if (ret == -EOPNOTSUPP) {
                ext4_warning(sb, "discard not supported, disabling");
                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
        }
+        return ret;
 }
 /*
@@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
 #endif
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-        ext4_pspace_cachep =
+        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
-                kmem_cache_create("ext4_prealloc_space",
+                                        SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_prealloc_space),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_pspace_cachep == NULL)
                return -ENOMEM;
-        ext4_ac_cachep =
+        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
-                kmem_cache_create("ext4_alloc_context",
+                                    SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_allocation_context),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_ac_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
-        ext4_free_ext_cachep =
+        ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
-                kmem_cache_create("ext4_free_block_extents",
+                                          SLAB_RECLAIM_ACCOUNT);
-                                     sizeof(struct ext4_free_data),
-                                     0, SLAB_RECLAIM_ACCOUNT, NULL);
        if (ext4_free_ext_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                kmem_cache_destroy(ext4_ac_cachep);
@@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void)
        return 0;
 }
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
+        int i;
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
@@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void)
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
+        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+                struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+                if (cachep) {
+                        char *name = (char *)kmem_cache_name(cachep);
+                        kmem_cache_destroy(cachep);
+                        kfree(name);
+                }
+        }
        ext4_remove_debugfs_entry();
 }
@@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
 */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-                        struct ext4_prealloc_space *pa,
+                        struct ext4_prealloc_space *pa)
-                        struct ext4_allocation_context *ac)
 {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = pa->pa_inode;
-        }
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
@@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;
-                if (ac) {
+                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
-                        ac->ac_b_ex.fe_group = group;
+                trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
-                        ac->ac_b_ex.fe_start = bit;
+                                               grp_blk_start + bit, next - bit);
-                        ac->ac_b_ex.fe_len = next - bit;
-                        ac->ac_b_ex.fe_logical = 0;
-                        trace_ext4_mballoc_discard(ac);
-                }
-                trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-                                struct ext4_prealloc_space *pa,
+                                struct ext4_prealloc_space *pa)
-                                struct ext4_allocation_context *ac)
 {
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t group;
        ext4_grpblk_t bit;
-        trace_ext4_mb_release_group_pa(sb, ac, pa);
+        trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = NULL;
-                ac->ac_b_ex.fe_group = group;
-                ac->ac_b_ex.fe_start = bit;
-                ac->ac_b_ex.fe_len = pa->pa_len;
-                ac->ac_b_ex.fe_logical = 0;
-                trace_ext4_mballoc_discard(ac);
-        }
        return 0;
 }
@@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;
@@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
        INIT_LIST_HEAD(&list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac)
-                ac->ac_sb = sb;
 repeat:
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3756,9 @@ repeat:
                spin_unlock(pa->pa_obj_lock);
                if (pa->pa_type == MB_GROUP_PA)
-                        ext4_mb_release_group_pa(&e4b, pa, ac);
+                        ext4_mb_release_group_pa(&e4b, pa);
                else
-                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3766,6 @@ repeat:
 out:
        ext4_unlock_group(sb, group);
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
        return free;
@@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        ext4_group_t group = 0;
        struct list_head list;
        struct ext4_buddy e4b;
@@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
        INIT_LIST_HEAD(&list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac) {
-                ac->ac_sb = sb;
-                ac->ac_inode = inode;
-        }
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3870,7 @@ repeat:
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
@@ -3861,8 +3879,6 @@ repeat:
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
 }
 /*
@@ -4060,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
        struct ext4_buddy e4b;
        struct list_head discard_list;
        struct ext4_prealloc_space *pa, *tmp;
-        struct ext4_allocation_context *ac;
        mb_debug(1, "discard locality group preallocation\n");
        INIT_LIST_HEAD(&discard_list);
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac)
-                ac->ac_sb = sb;
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-                ext4_mb_release_group_pa(&e4b, pa, ac);
+                ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
 }
 /*
@@ -4491,7 +4501,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
-        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        unsigned long freed = 0;
        unsigned int overflow;
@@ -4531,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        if (!bh)
                                tbh = sb_find_get_block(inode->i_sb,
                                                        block + i);
+                        if (unlikely(!tbh))
+                                continue;
                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                    inode, tbh, block + i);
                }
@@ -4546,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        if (!ext4_should_writeback_data(inode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
-        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-        if (ac) {
-                ac->ac_inode = inode;
-                ac->ac_sb = sb;
-        }
 do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4614,7 @@ do_more:
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-        if (ac) {
+        trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
-                ac->ac_b_ex.fe_group = block_group;
-                ac->ac_b_ex.fe_start = bit;
-                ac->ac_b_ex.fe_len = count;
-                trace_ext4_mballoc_free(ac);
-        }
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
@@ -4644,8 +4644,6 @@ do_more:
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-                if (test_opt(sb, DISCARD))
-                        ext4_issue_discard(sb, block_group, bit, count);
        }
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4683,190 @@ error_return:
                dquot_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
-        if (ac)
-                kmem_cache_free(ext4_ac_cachep, ac);
        return;
 }
+/**
+ * ext4_trim_extent -- discard one free extent of a block group
+ * @sb:         super block of the file system
+ * @start:      first block of the free extent, relative to the group
+ * @count:      length of the extent in blocks
+ * @group:      block group the extent belongs to
+ * @e4b:        ext4 buddy of that group
+ *
+ * The extent is first marked as used in the buddy so that nobody can
+ * allocate it while the discard is running.  The group lock is dropped
+ * around the discard itself and taken again before the extent is given
+ * back to the buddy.  Must be called with the group lock held; the lock
+ * is held again on return.
+ *
+ * Returns the value returned by ext4_issue_discard().
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+                ext4_group_t group, struct ext4_buddy *e4b)
+{
+        struct ext4_free_extent ex;
+        int err;
+        assert_spin_locked(ext4_group_lock_ptr(sb, group));
+        ex.fe_group = group;
+        ex.fe_start = start;
+        ex.fe_len = count;
+        /* Reserve the range in the buddy before the lock is released. */
+        mb_mark_used(e4b, &ex);
+        ext4_unlock_group(sb, group);
+        err = ext4_issue_discard(sb, group, start, count);
+        if (err != 0)
+                ext4_std_error(sb, err);
+        ext4_lock_group(sb, group);
+        /* The range goes back to the buddy whether or not the discard worked. */
+        mb_free_blocks(NULL, e4b, start, ex.fe_len);
+        return err;
+}
+/**
+ * ext4_trim_all_free -- trim the free extents of one block group
+ * @sb:                 super block of the file system
+ * @e4b:                ext4 buddy of the group to work on
+ * @start:              first group block to examine
+ * @max:                upper bound of the blocks to examine
+ * @minblocks:          extents shorter than this are not trimmed
+ *
+ * Walks the bitmap at e4b->bd_bitmap looking for runs of clear bits
+ * between @start and @max.  Each run of at least @minblocks blocks is
+ * handed to ext4_trim_extent().  The walk starts no earlier than the
+ * group's bb_first_free, yields the group lock when a reschedule is
+ * needed, and stops early once bb_free minus the blocks trimmed so far
+ * falls below @minblocks.
+ *
+ * Returns the number of blocks trimmed, -ERESTARTSYS when a fatal signal
+ * is pending, or the negative error returned by ext4_trim_extent().
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+                ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+        ext4_grpblk_t end, len, trimmed = 0;
+        ext4_group_t group;
+        void *bitmap;
+        int err = 0;
+        BUG_ON(e4b == NULL);
+        bitmap = e4b->bd_bitmap;
+        group = e4b->bd_group;
+        /* Begin at bb_first_free when it lies beyond the requested start. */
+        if (e4b->bd_info->bb_first_free > start)
+                start = e4b->bd_info->bb_first_free;
+        ext4_lock_group(sb, group);
+        while (start < max) {
+                start = mb_find_next_zero_bit(bitmap, max, start);
+                if (start >= max)
+                        break;
+                end = mb_find_next_bit(bitmap, max, start);
+                len = end - start;
+                if (len >= minblocks) {
+                        err = ext4_trim_extent(sb, start,
+                                len, group, e4b);
+                        if (err < 0)
+                                break;
+                        trimmed += len;
+                }
+                /* The bit at "end" is set, so resume right after it. */
+                start = end + 1;
+                if (fatal_signal_pending(current)) {
+                        trimmed = -ERESTARTSYS;
+                        break;
+                }
+                if (need_resched()) {
+                        ext4_unlock_group(sb, group);
+                        cond_resched();
+                        ext4_lock_group(sb, group);
+                }
+                /* Too little free space left to hold another extent. */
+                if ((e4b->bd_info->bb_free - trimmed) < minblocks)
+                        break;
+        }
+        ext4_unlock_group(sb, group);
+        ext4_debug("trimmed %d blocks in the group %d\n",
+                trimmed, group);
+        return (err < 0) ? err : trimmed;
+}
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:                 superblock for filesystem
+ * @range:              fstrim_range structure
+ *
+ * @range carries three values in bytes:
+ * start:       first byte to trim
+ * len:         number of bytes to trim from start
+ * minlen:      minimum extent length in bytes
+ * All three are converted to file system blocks.  ext4_trim_fs goes
+ * through the allocation groups from the one containing start to the one
+ * containing start+len (clamped to the last group of the file system) and
+ * invokes ext4_trim_all_free on every group that has at least minlen free
+ * blocks.
+ *
+ * On return range->len holds the number of bytes trimmed.  Returns 0 on
+ * success, -EINVAL if minlen is larger than a block group or the first
+ * group lies beyond the last one, or the error reported by
+ * ext4_mb_load_buddy() or ext4_trim_all_free().
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+        struct ext4_buddy e4b;
+        ext4_group_t first_group, last_group;
+        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+        ext4_grpblk_t cnt, first_block, last_block;
+        uint64_t start, len, minlen, trimmed;
+        int ret = 0;
+        start = range->start >> sb->s_blocksize_bits;
+        len = range->len >> sb->s_blocksize_bits;
+        minlen = range->minlen >> sb->s_blocksize_bits;
+        trimmed = 0;
+        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+                return -EINVAL;
+        /* Determine first and last group to examine based on start and len */
+        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+                                     &first_group, &first_block);
+        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+                                     &last_group, &last_block);
+        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+        /* Every group but the final partial one is scanned to its end. */
+        last_block = EXT4_BLOCKS_PER_GROUP(sb);
+        if (first_group > last_group)
+                return -EINVAL;
+        for (group = first_group; group <= last_group; group++) {
+                ret = ext4_mb_load_buddy(sb, group, &e4b);
+                if (ret) {
+                        ext4_error(sb, "Error in loading buddy "
+                                        "information for %u", group);
+                        break;
+                }
+                /*
+                 * NOTE(review): "last_block = len" does not take first_block
+                 * into account -- confirm this is what is wanted when the
+                 * range starts in the middle of a group.
+                 */
+                if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+                        len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+                else
+                        last_block = len;
+                if (e4b.bd_info->bb_free >= minlen) {
+                        cnt = ext4_trim_all_free(sb, &e4b, first_block,
+                                                last_block, minlen);
+                        if (cnt < 0) {
+                                ret = cnt;
+                                ext4_mb_unload_buddy(&e4b);
+                                break;
+                        }
+                        /*
+                         * Accumulate here, not after the branch: a group
+                         * that is skipped must not add the previous
+                         * group's count a second time.
+                         */
+                        trimmed += cnt;
+                }
+                ext4_mb_unload_buddy(&e4b);
+                first_block = 0;
+        }
+        range->len = trimmed * sb->s_blocksize;
+        return ret;
+}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 1765c2c50a9b..25f3a974b725 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
        struct buffer_head *bh;
        struct ext4_extent_header *eh;
-        block = idx_pblock(ix);
+        block = ext4_idx_pblock(ix);
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5f1ed9fc913c..b9f3e7862f13 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
-                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
                return 0;
        }
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                        /* index block */
                        path[ppos].p_idx++;
-                        path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                        path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                        if (path[ppos+1].p_bh)
                                brelse(path[ppos+1].p_bh);
                        path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                path[cur_ppos].p_idx =
                                        EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
                                path[cur_ppos].p_block =
-                                        idx_pblock(path[cur_ppos].p_idx);
+                                        ext4_idx_pblock(path[cur_ppos].p_idx);
                                if (path[cur_ppos+1].p_bh)
                                        brelse(path[cur_ppos+1].p_bh);
                                path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
                        path[leaf_ppos].p_block =
-                                        ext_pblock(path[leaf_ppos].p_ext);
+                                        ext4_ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                         */
                        o_end->ee_block = end_ext->ee_block;
                        o_end->ee_len = end_ext->ee_len;
-                        ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                        ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
                }
                o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                 */
                o_end->ee_block = end_ext->ee_block;
                o_end->ee_len = end_ext->ee_len;
-                ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
                /*
                 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
        /* Insert new entry */
        if (new_ext->ee_len) {
                o_start[i] = *new_ext;
-                ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+                ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
        }
        /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        start_ext.ee_len = end_ext.ee_len = 0;
        new_ext.ee_block = cpu_to_le32(*from);
-        ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+        ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
        new_ext.ee_len = dext->ee_len;
        new_ext_alen = ext4_ext_get_actual_len(&new_ext);
        new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
                copy_extent_status(oext, &end_ext);
                end_ext_alen = ext4_ext_get_actual_len(&end_ext);
                ext4_ext_store_pblock(&end_ext,
-                        (ext_pblock(o_end) + oext_alen - end_ext_alen));
+                        (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
                end_ext.ee_block =
                        cpu_to_le32(le32_to_cpu(o_end->ee_block) +
                        oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
        /* When tmp_dext is too large, pick up the target range. */
        diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-        ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+        ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
        tmp_dext->ee_block =
                        cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
        tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
                tmp_dext->ee_len = cpu_to_le16(max_count);
        orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-        ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+        ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
        /* Adjust extent length if donor extent is larger than orig */
        if (ext4_ext_get_actual_len(tmp_dext) >
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3b3fa9..92203b8a099f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block, b;
+        const u8 *name = d_name->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        namelen = d_name->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;
+        if ((namelen <= 2) && (name[0] == '.') &&
+            (name[1] == '.' || name[1] == '\0')) {
+                /*
+                 * "." or ".." will only be in the first block
+                 * NFS may look up ".."; "." should be handled by the VFS
+                 *
+                 * The second test must be against the NUL terminator, not
+                 * the digit '0': otherwise a regular entry named ".0"
+                 * would be looked for in the first block only.
+                 */
+                block = start = 0;
+                nblocks = 1;
+                goto restart;
+        }
        if (is_dx(dir)) {
                bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
                /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
                       struct ext4_dir_entry_2 **res_dir, int *err)
 {
-        struct super_block * sb;
+        struct super_block * sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-        u32 hash;
        struct dx_frame frames[2], *frame;
-        struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;
-        int namelen = d_name->len;
-        const u8 *name = d_name->name;
-        sb = dir->i_sb;
+        if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-        /* NFS may look up ".." - look at dx_root directory block */
+                return NULL;
-        if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-                if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-                        return NULL;
-        } else {
-                frame = frames;
-                frame->bh = NULL;                       /* for dx_release() */
-                frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-                dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-        }
-        hash = hinfo.hash;
        do {
                block = dx_get_block(frame->at);
-                if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+                if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                        goto errout;
-                de = (struct ext4_dir_entry_2 *) bh->b_data;
-                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                       EXT4_DIR_REC_LEN(0));
-                for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                                  + ((char *) de - bh->b_data);
-                        if (!ext4_check_dir_entry(dir, de, bh, off)) {
-                                brelse(bh);
-                                *err = ERR_BAD_DX_DIR;
-                                goto errout;
-                        }
-                        if (ext4_match(namelen, name, de)) {
+                retval = search_dirblock(bh, dir, d_name,
-                                *res_dir = de;
+                                         block << EXT4_BLOCK_SIZE_BITS(sb),
-                                dx_release(frames);
+                                         res_dir);
-                                return bh;
+                if (retval == 1) {      /* Success! */
-                        }
+                        dx_release(frames);
+                        return bh;
                }
                brelse(bh);
+                if (retval == -1) {
+                        *err = ERR_BAD_DX_DIR;
+                        goto errout;
+                }
                /* Check to see if we should continue to search */
-                retval = ext4_htree_next_block(dir, hash, frame,
+                retval = ext4_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning(sb,
@@ -2312,7 +2303,7 @@ retry:
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = ext4_add_entry(handle, dentry, inode);
        if (!err) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 000000000000..7f5451cd1d38
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,431 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+#define WQ_HASH_SZ              37
+#define to_ioend_wq(v)  (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+/*
+ * Create the slab caches for ext4_io_page and ext4_io_end objects and
+ * initialise the hashed ioend wait queues.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created.  The
+ * io_page cache is destroyed again when creating the io_end cache fails.
+ */
+int __init ext4_init_pageio(void)
+{
+        int i;
+        io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+        if (io_page_cachep == NULL)
+                return -ENOMEM;
+        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+        /* Check the cache just created, not io_page_cachep a second time. */
+        if (io_end_cachep == NULL) {
+                kmem_cache_destroy(io_page_cachep);
+                return -ENOMEM;
+        }
+        for (i = 0; i < WQ_HASH_SZ; i++)
+                init_waitqueue_head(&ioend_wq[i]);
+        return 0;
+}
+/* Destroy the two slab caches created by ext4_init_pageio(). */
+void ext4_exit_pageio(void)
+{
+        kmem_cache_destroy(io_page_cachep);
+        kmem_cache_destroy(io_end_cachep);
+}
+/*
+ * Sleep on the inode's hashed ioend wait queue until its i_ioend_count
+ * reads zero.
+ */
+void ext4_ioend_wait(struct inode *inode)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        wait_queue_head_t *waitq = to_ioend_wq(inode);
+        wait_event(*waitq, atomic_read(&ei->i_ioend_count) == 0);
+}
+/*
+ * Drop one reference on @io_page.  When the count reaches zero, end
+ * writeback on the page, release the page reference and free @io_page.
+ */
+static void put_io_page(struct ext4_io_page *io_page)
+{
+        struct page *page;
+        if (!atomic_dec_and_test(&io_page->p_count))
+                return;
+        page = io_page->p_page;
+        end_page_writeback(page);
+        put_page(page);
+        kmem_cache_free(io_page_cachep, io_page);
+}
+/*
+ * Release an io_end: drop its page references, decrement the inode's
+ * i_ioend_count (waking any waiter when it reaches zero) and return the
+ * structure to its slab cache.
+ */
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+        struct ext4_inode_info *ei;
+        wait_queue_head_t *waitq;
+        int idx;
+        BUG_ON(!io);
+        if (io->page)
+                put_page(io->page);
+        for (idx = 0; idx < io->num_io_pages; idx++)
+                put_io_page(io->pages[idx]);
+        io->num_io_pages = 0;
+        ei = EXT4_I(io->inode);
+        waitq = to_ioend_wq(io->inode);
+        if (atomic_dec_and_test(&ei->i_ioend_count) &&
+            waitqueue_active(waitq))
+                wake_up_all(waitq);
+        kmem_cache_free(io_end_cachep, io);
+}
+/*
+ * Convert the unwritten extents covered by @io to written ones.
+ *
+ * Nothing is done when @io is not on a list or does not carry
+ * EXT4_IO_END_UNWRITTEN.  After a successful conversion the attached
+ * iocb, if any, is completed and the flag is cleared.  Returns the
+ * negative error of the conversion, otherwise its non-negative result
+ * (0 when there was nothing to do).
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+        struct inode *inode = io->inode;
+        int err;
+        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+                   "list->prev 0x%p\n",
+                   io, inode->i_ino, io->list.next, io->list.prev);
+        if (list_empty(&io->list) || !(io->flag & EXT4_IO_END_UNWRITTEN))
+                return 0;
+        err = ext4_convert_unwritten_extents(inode, io->offset, io->size);
+        if (err < 0) {
+                printk(KERN_EMERG "%s: failed to convert unwritten "
+                        "extents to written extents, error is %d "
+                        "io is still on inode %lu aio dio list\n",
+                       __func__, err, inode->i_ino);
+                return err;
+        }
+        if (io->iocb)
+                aio_complete(io->iocb, io->result, 0);
+        /* the conversion is done, so the unwritten flag can go */
+        io->flag &= ~EXT4_IO_END_UNWRITTEN;
+        return err;
+}
+/*
+ * Work item run for a completed io_end: convert its unwritten extents
+ * under i_mutex, take it off the inode's completed-io list and free it.
+ * When the conversion fails the io_end is neither unlinked nor freed.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+        ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+        struct inode *inode = io->inode;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long irqflags;
+        mutex_lock(&inode->i_mutex);
+        if (ext4_end_io_nolock(io) < 0) {
+                mutex_unlock(&inode->i_mutex);
+                return;
+        }
+        spin_lock_irqsave(&ei->i_completed_io_lock, irqflags);
+        if (!list_empty(&io->list))
+                list_del_init(&io->list);
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, irqflags);
+        mutex_unlock(&inode->i_mutex);
+        ext4_free_io_end(io);
+}
+/*
+ * Allocate a zeroed io_end for @inode using the allocation flags @flags,
+ * bump the inode's i_ioend_count and set up the work item and list head.
+ * Returns NULL when the allocation fails.
+ */
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+        ext4_io_end_t *io_end = kmem_cache_alloc(io_end_cachep, flags);
+        if (io_end == NULL)
+                return NULL;
+        memset(io_end, 0, sizeof(*io_end));
+        atomic_inc(&EXT4_I(inode)->i_ioend_count);
+        io_end->inode = inode;
+        INIT_WORK(&io_end->work, ext4_end_io_work);
+        INIT_LIST_HEAD(&io_end->list);
+        return io_end;
+}
+/*
+ * Report a buffer I/O error in the format used by fs/buffer.c, so that
+ * dmesg scrapers looking for that specific message keep working.  A
+ * unified error reporting structure to userspace would be the better
+ * answer, but none is available.
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+        char devname[BDEVNAME_SIZE];
+        unsigned long long blocknr = bh->b_blocknr;
+        printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+                        bdevname(bh->b_bdev, devname), blocknr);
+}
+/*
+ * Completion handler for the bios set up by io_submit_init().
+ *
+ * For every page attached to the io_end the dirty bits of the buffers
+ * covered by the I/O are cleared, the page is flagged on error, marked
+ * uptodate when no buffer is left delayed or dirty, and the io_page
+ * reference is dropped.  The io_end is then put on the inode's
+ * completed-io list and its work item is queued on dio_unwritten_wq.
+ */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+        ext4_io_end_t *io_end = bio->bi_private;
+        struct workqueue_struct *wq;
+        struct inode *inode;
+        unsigned long flags;
+        /* Saved now: the bio may be gone by the time the warning is printed. */
+        sector_t bi_sector = bio->bi_sector;
+        int i;
+        BUG_ON(!io_end);
+        bio->bi_private = NULL;
+        bio->bi_end_io = NULL;
+        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+                error = 0;
+        /* The bio must not be dereferenced past this point. */
+        bio_put(bio);
+        for (i = 0; i < io_end->num_io_pages; i++) {
+                struct page *page = io_end->pages[i]->p_page;
+                struct buffer_head *bh, *head;
+                int partial_write = 0;
+                head = page_buffers(page);
+                if (error)
+                        SetPageError(page);
+                BUG_ON(!head);
+                if (head->b_size == PAGE_CACHE_SIZE)
+                        clear_buffer_dirty(head);
+                else {
+                        loff_t offset;
+                        loff_t io_end_offset = io_end->offset + io_end->size;
+                        /* Widen before shifting so the byte offset cannot wrap. */
+                        offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
+                        bh = head;
+                        do {
+                                if ((offset >= io_end->offset) &&
+                                    (offset+bh->b_size <= io_end_offset)) {
+                                        if (error)
+                                                buffer_io_error(bh);
+                                        clear_buffer_dirty(bh);
+                                }
+                                if (buffer_delay(bh))
+                                        partial_write = 1;
+                                else if (!buffer_mapped(bh))
+                                        clear_buffer_dirty(bh);
+                                else if (buffer_dirty(bh))
+                                        partial_write = 1;
+                                offset += bh->b_size;
+                                bh = bh->b_this_page;
+                        } while (bh != head);
+                }
+                /*
+                 * If this is a partial write which happened to make
+                 * all buffers uptodate then we can optimize away a
+                 * bogus readpage() for the next read(). Here we
+                 * 'discover' whether the page went uptodate as a
+                 * result of this (potentially partial) write.
+                 *
+                 * This is done before put_io_page(), which may drop
+                 * the last reference held on the page.
+                 */
+                if (!partial_write)
+                        SetPageUptodate(page);
+                put_io_page(io_end->pages[i]);
+        }
+        io_end->num_io_pages = 0;
+        inode = io_end->inode;
+        if (error) {
+                io_end->flag |= EXT4_IO_END_ERROR;
+                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                             "(offset %llu size %ld starting block %llu)",
+                             inode->i_ino,
+                             (unsigned long long) io_end->offset,
+                             (long) io_end->size,
+                             (unsigned long long)
+                             bi_sector >> (inode->i_blkbits - 9));
+        }
+        /* Add the io_end to per-inode completed io list*/
+        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+        list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+        wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
+}
+/*
+ * Submit the bio collected in @io, if there is one, and reset @io so
+ * that the next buffer starts a new bio and io_end.
+ */
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+        struct bio *bio = io->io_bio;
+        if (bio != NULL) {
+                /* Hold a reference across submit_bio() for the flag check. */
+                bio_get(bio);
+                submit_bio(io->io_op, bio);
+                BUG_ON(bio_flagged(bio, BIO_EOPNOTSUPP));
+                bio_put(bio);
+        }
+        io->io_bio = NULL;
+        io->io_op = 0;
+        io->io_end = NULL;
+}
+/*
+ * Start a new bio and io_end in @io for an I/O that begins at buffer @bh.
+ *
+ * The bio is aimed at the device sector of @bh and completes through
+ * ext4_end_bio(); the io_end records the byte offset of @bh in the file.
+ * The write op is WRITE_SYNC_PLUG for WB_SYNC_ALL writeback and WRITE
+ * otherwise.  Returns 0, or -ENOMEM when no io_end can be allocated.
+ */
+static int io_submit_init(struct ext4_io_submit *io,
+                          struct inode *inode,
+                          struct writeback_control *wbc,
+                          struct buffer_head *bh)
+{
+        ext4_io_end_t *io_end;
+        struct page *page = bh->b_page;
+        int nvecs = bio_get_nr_vecs(bh->b_bdev);
+        struct bio *bio;
+        io_end = ext4_init_io_end(inode, GFP_NOFS);
+        if (!io_end)
+                return -ENOMEM;
+        /* Retry with half as many vecs each time the allocation fails. */
+        do {
+                bio = bio_alloc(GFP_NOIO, nvecs);
+                nvecs >>= 1;
+        } while (bio == NULL);
+        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+        bio->bi_bdev = bh->b_bdev;
+        bio->bi_private = io->io_end = io_end;
+        bio->bi_end_io = ext4_end_bio;
+        /*
+         * Widen page->index before shifting: done in unsigned long, the
+         * shift would wrap on 32-bit for offsets of 4GB and above.
+         */
+        io_end->offset = ((loff_t) page->index << PAGE_CACHE_SHIFT) +
+                         bh_offset(bh);
+        io->io_bio = bio;
+        io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
+                        WRITE_SYNC_PLUG : WRITE);
+        io->io_next_block = bh->b_blocknr;
+        return 0;
+}
+/*
+ * Add buffer @bh, which belongs to @io_page, to the bio being built in
+ * @io, starting a new bio whenever the current one cannot take it.
+ *
+ * Unmapped or delayed buffers are not written; they only cause a pending
+ * bio to be submitted.  Returns 0, or the error from io_submit_init().
+ */
+static int io_submit_add_bh(struct ext4_io_submit *io,
+                            struct ext4_io_page *io_page,
+                            struct inode *inode,
+                            struct writeback_control *wbc,
+                            struct buffer_head *bh)
+{
+        ext4_io_end_t *io_end;
+        int ret;
+        if (buffer_new(bh)) {
+                clear_buffer_new(bh);
+                unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+        }
+        /* Nothing to write for this buffer: flush what has been collected. */
+        if (!buffer_mapped(bh) || buffer_delay(bh)) {
+                if (!buffer_mapped(bh))
+                        clear_buffer_dirty(bh);
+                if (io->io_bio)
+                        ext4_io_submit(io);
+                return 0;
+        }
+        /* A bio only covers consecutive blocks; submit it on a gap. */
+        if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+                /*
+                 * Also entered by goto from below.  ext4_io_submit() clears
+                 * io->io_bio, so the check that follows sets up a new bio.
+                 */
+submit_and_retry:
+                ext4_io_submit(io);
+        }
+        if (io->io_bio == NULL) {
+                ret = io_submit_init(io, inode, wbc, bh);
+                if (ret)
+                        return ret;
+        }
+        io_end = io->io_end;
+        /* The io_end is full and this buffer would need a new page slot. */
+        if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+            (io_end->pages[io_end->num_io_pages-1] != io_page))
+                goto submit_and_retry;
+        if (buffer_uninit(bh))
+                io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
+        /*
+         * NOTE(review): size and io_next_block are advanced before
+         * bio_add_page() is known to have succeeded; on the retry path
+         * the io_end that gets submitted has already been charged for
+         * this buffer -- confirm that this is intended.
+         */
+        io->io_end->size += bh->b_size;
+        io->io_next_block++;
+        ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+        if (ret != bh->b_size)
+                goto submit_and_retry;
+        /* Record each io_page once per io_end and take a reference on it. */
+        if ((io_end->num_io_pages == 0) ||
+            (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+                io_end->pages[io_end->num_io_pages++] = io_page;
+                atomic_inc(&io_page->p_count);
+        }
+        return 0;
+}
+/*
+ * Queue the buffers of @page that lie within the first @len bytes for
+ * writing through @io.
+ *
+ * Buffers starting at or beyond @len are marked clean and uptodate
+ * without being written.  The page is unlocked before returning on every
+ * path.  Returns 0 on success, or -ENOMEM when the io_page or an io_end
+ * cannot be allocated; the page is redirtied in that case.
+ */
+int ext4_bio_write_page(struct ext4_io_submit *io,
+                        struct page *page,
+                        int len,
+                        struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        unsigned block_start, block_end, blocksize;
+        struct ext4_io_page *io_page;
+        struct buffer_head *bh, *head;
+        int ret = 0;
+        blocksize = 1 << inode->i_blkbits;
+        BUG_ON(PageWriteback(page));
+        io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+        if (!io_page) {
+                set_page_dirty(page);
+                unlock_page(page);
+                return -ENOMEM;
+        }
+        io_page->p_page = page;
+        atomic_set(&io_page->p_count, 1);
+        get_page(page);
+        /*
+         * Writeback is flagged only once the io_page exists: the failure
+         * return above has no io_page whose release would end writeback,
+         * so setting the flag earlier would leave it set for good.
+         */
+        set_page_writeback(page);
+        ClearPageError(page);
+        for (bh = head = page_buffers(page), block_start = 0;
+             bh != head || !block_start;
+             block_start = block_end, bh = bh->b_this_page) {
+                block_end = block_start + blocksize;
+                if (block_start >= len) {
+                        clear_buffer_dirty(bh);
+                        set_buffer_uptodate(bh);
+                        continue;
+                }
+                ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+                if (ret) {
+                        /*
+                         * We only get here on ENOMEM.  Not much else
+                         * we can do but mark the page as dirty, and
+                         * better luck next time.
+                         */
+                        set_page_dirty(page);
+                        break;
+                }
+        }
+        unlock_page(page);
+        /*
+         * If the page was truncated before we could do the writeback,
+         * or we had a memory allocation error while trying to write
+         * the first buffer head, we won't have submitted any pages for
+         * I/O.  In that case we need to make sure we've cleared the
+         * PageWriteback bit from the page to prevent the system from
+         * wedging later on.
+         */
+        put_io_page(io_page);
+        return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca5c8aa00a2f..dc963929de65 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
        }
        /* Zero out all of the reserved backup group descriptor table blocks */
-        for (i = 0, bit = gdblocks + 1, block = start + bit;
+        ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
-             i < reserved_gdb; i++, block++, bit++) {
+                        block, sbi->s_itb_per_group);
-                struct buffer_head *gdb;
+        err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+                               GFP_NOFS);
-                ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
+        if (err)
+                goto exit_bh;
-                if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                        goto exit_bh;
-                if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                        err = PTR_ERR(gdb);
-                        goto exit_bh;
-                }
-                ext4_handle_dirty_metadata(handle, NULL, gdb);
-                ext4_set_bit(bit, bh->b_data);
-                brelse(gdb);
-        }
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
        ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
        ext4_set_bit(input->inode_bitmap - start, bh->b_data);
        /* Zero out all of the inode table blocks */
-        for (i = 0, block = input->inode_table, bit = block - start;
+        block = input->inode_table;
-             i < sbi->s_itb_per_group; i++, bit++, block++) {
+        ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
-                struct buffer_head *it;
+                        block, sbi->s_itb_per_group);
+        err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
-                ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
+        if (err)
+                goto exit_bh;
-                if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                        goto exit_bh;
-                if (IS_ERR(it = bclean(handle, sb, block))) {
-                        err = PTR_ERR(it);
-                        goto exit_bh;
-                }
-                ext4_handle_dirty_metadata(handle, NULL, it);
-                brelse(it);
-                ext4_set_bit(bit, bh->b_data);
-        }
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
-        mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+        ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+                             bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                goto exit_journal;
        }
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-                        bh->b_data);
+                             bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
        brelse(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f47c366bf15..61182fe6254e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/vfs.h>
@@ -41,6 +40,9 @@
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -50,8 +52,11 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
-struct proc_dir_entry *ext4_proc_root;
+static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
+struct ext4_lazy_init *ext4_li_info;
+struct mutex ext4_li_mtx;
+struct ext4_features *ext4_feat;
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt);
+                       const char *dev_name, void *data);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext3",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -702,13 +709,13 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
+        ext4_unregister_li_request(sb);
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        flush_workqueue(sbi->dio_unwritten_wq);
        destroy_workqueue(sbi->dio_unwritten_wq);
        lock_super(sb);
-        lock_kernel();
        if (sb->s_dirt)
                ext4_commit_super(sb, 1);
@@ -719,6 +726,7 @@ static void ext4_put_super(struct super_block *sb)
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
+        del_timer(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
@@ -775,7 +783,6 @@ static void ext4_put_super(struct super_block *sb)
         * Now that we are completely done shutting down the
         * superblock, we need to actually destroy the kobject.
         */
-        unlock_kernel();
        unlock_super(sb);
        kobject_put(&sbi->s_kobj);
        wait_for_completion(&sbi->s_kobj_unregister);
@@ -821,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->cur_aio_dio = NULL;
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
+        atomic_set(&ei->i_ioend_count, 0);
        return &ei->vfs_inode;
 }
+/*
+ * ->drop_inode hook: take the decision from generic_drop_inode() and
+ * report it through the ext4_drop_inode tracepoint before returning it.
+ */
+static int ext4_drop_inode(struct inode *inode)
+{
+        int should_drop;
+        should_drop = generic_drop_inode(inode);
+        trace_ext4_drop_inode(inode, should_drop);
+        return should_drop;
+}
 static void ext4_destroy_inode(struct inode *inode)
 {
+        ext4_ioend_wait(inode);
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
@@ -1045,6 +1062,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
                seq_puts(seq, ",block_validity");
+        if (!test_opt(sb, INIT_INODE_TABLE))
+                seq_puts(seq, ",noinit_inode_table");
+        else if (sbi->s_li_wait_mult)
+                seq_printf(seq, ",init_inode_table=%u",
+                           (unsigned) sbi->s_li_wait_mult);
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -1160,6 +1183,7 @@ static const struct super_operations ext4_sops = {
        .destroy_inode  = ext4_destroy_inode,
        .write_inode    = ext4_write_inode,
        .dirty_inode    = ext4_dirty_inode,
+        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,
        .put_super      = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
@@ -1173,6 +1197,7 @@ static const struct super_operations ext4_sops = {
        .quota_write    = ext4_quota_write,
 #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
+        .trim_fs        = ext4_trim_fs
 };
 static const struct super_operations ext4_nojournal_sops = {
@@ -1180,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = {
        .destroy_inode  = ext4_destroy_inode,
        .write_inode    = ext4_write_inode,
        .dirty_inode    = ext4_dirty_inode,
+        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,
        .write_super    = ext4_write_super,
        .put_super      = ext4_put_super,
@@ -1219,6 +1245,7 @@ enum {
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
+        Opt_init_inode_table, Opt_noinit_inode_table,
 };
 static const match_table_t tokens = {
@@ -1289,6 +1316,9 @@ static const match_table_t tokens = {
        {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
+        {Opt_init_inode_table, "init_itable=%u"},
+        {Opt_init_inode_table, "init_itable"},
+        {Opt_noinit_inode_table, "noinit_itable"},
        {Opt_err, NULL},
 };
@@ -1759,6 +1789,20 @@ set_qf_format:
                case Opt_dioread_lock:
                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
                        break;
+                case Opt_init_inode_table:
+                        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        if (args[0].from) {
+                                if (match_int(&args[0], &option))
+                                        return 0;
+                        } else
+                                option = EXT4_DEF_LI_WAIT_MULT;
+                        if (option < 0)
+                                return 0;
+                        sbi->s_li_wait_mult = option;
+                        break;
+                case Opt_noinit_inode_table:
+                        clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1942,7 +1986,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+                                  ext4_group_t *first_not_zeroed)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1951,7 +1996,7 @@ static int ext4_check_descriptors(struct super_block *sb)
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
-        ext4_group_t i;
+        ext4_group_t i, grp = sbi->s_groups_count;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                flexbg_flag = 1;
@@ -1967,6 +2012,10 @@ static int ext4_check_descriptors(struct super_block *sb)
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+                if ((grp == sbi->s_groups_count) &&
+                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        grp = i;
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2004,6 +2053,8 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
+        if (NULL != first_not_zeroed)
+                *first_not_zeroed = grp;
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2376,6 +2427,7 @@ static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)       \
@@ -2412,6 +2464,16 @@ static struct attribute *ext4_attrs[] = {
        NULL,
 };
+/*
+ * Features this copy of ext4 supports.  Each feature is declared with
+ * EXT4_INFO_ATTR, i.e. a 0444 attribute with neither a show nor a store
+ * handler, and listed in the NULL-terminated ext4_feat_attrs[] array
+ * that ext4_feat_ktype uses as its default_attrs.
+ */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+static struct attribute *ext4_feat_attrs[] = {
+        ATTR_LIST(lazy_itable_init),
+        ATTR_LIST(batched_discard),
+        NULL,
+};
 static ssize_t ext4_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
 {
@@ -2440,7 +2502,6 @@ static void ext4_sb_release(struct kobject *kobj)
        complete(&sbi->s_kobj_unregister);
 }
 static const struct sysfs_ops ext4_attr_ops = {
        .show   = ext4_attr_show,
        .store  = ext4_attr_store,
@@ -2452,6 +2513,17 @@ static struct kobj_type ext4_ktype = {
        .release        = ext4_sb_release,
 };
+/*
+ * kobject release hook for the features kobject: signal the
+ * f_kobj_unregister completion of the global ext4_feat.  @kobj itself
+ * is not consulted.
+ */
+static void ext4_feat_release(struct kobject *kobj)
+{
+        struct ext4_features *feat = ext4_feat;
+        complete(&feat->f_kobj_unregister);
+}
+/*
+ * kobj_type for the features kobject: exposes ext4_feat_attrs through
+ * the shared ext4_attr_ops and completes on release.
+ */
+static struct kobj_type ext4_feat_ktype = {
+        .release        = ext4_feat_release,
+        .sysfs_ops      = &ext4_attr_ops,
+        .default_attrs  = ext4_feat_attrs,
+};
 /*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
@@ -2542,6 +2614,371 @@ static void print_daily_error_info(unsigned long arg)
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
+/*
+ * Timer callback: @data carries the task_struct pointer of the task
+ * that should be woken up.
+ */
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+        wake_up_process((struct task_struct *)data);
+}
+/*
+ * Find the next group, starting at elr->lr_next_group, whose descriptor
+ * lacks EXT4_BG_INODE_ZEROED and run ext4_init_inode_table() on it.
+ *
+ * Returns 1 when a group descriptor cannot be read or when the scan
+ * stops exactly at the group count (no candidate found); otherwise
+ * returns whatever ext4_init_inode_table() returned.  On that latter
+ * path the request's timeout (if still unset), next schedule time and
+ * next group are updated.
+ */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+        struct super_block *sb = elr->lr_super;
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t group;
+        struct ext4_group_desc *desc;
+        unsigned long started, elapsed;
+        int ret;
+        for (group = elr->lr_next_group; group < ngroups; group++) {
+                desc = ext4_get_group_desc(sb, group, NULL);
+                if (!desc)
+                        return 1;
+                if (!(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        break;
+        }
+        if (group == ngroups)
+                return 1;
+        started = jiffies;
+        ret = ext4_init_inode_table(sb, group,
+                                    elr->lr_timeout ? 0 : 1);
+        if (elr->lr_timeout == 0) {
+                /*
+                 * First run: derive the request's timeout from how long
+                 * this pass took, scaled by s_li_wait_mult (or by 20
+                 * when that multiplier is zero).
+                 */
+                elapsed = jiffies - started;
+                if (elr->lr_sbi->s_li_wait_mult)
+                        elapsed *= elr->lr_sbi->s_li_wait_mult;
+                else
+                        elapsed *= 20;
+                elr->lr_timeout = elapsed;
+        }
+        elr->lr_next_sched = jiffies + elr->lr_timeout;
+        elr->lr_next_group = group + 1;
+        return ret;
+}
+/*
+ * Unlink @elr from the request list, clear the owning superblock's
+ * s_li_request pointer and free the request structure.  A NULL @elr
+ * is ignored.  Should be called with li_list_mtx held.
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+        if (elr) {
+                struct ext4_sb_info *owner = elr->lr_sbi;
+                list_del(&elr->lr_request);
+                owner->s_li_request = NULL;
+                kfree(elr);
+        }
+}
+/*
+ * Drop the lazy-init request registered for @sb, if any.
+ *
+ * ext4_li_mtx is held across the whole operation because the lazyinit
+ * thread frees ext4_li_info under that mutex when it exits; checking
+ * and dereferencing ext4_li_info without it could touch freed memory.
+ * The request pointer is sampled only once li_list_mtx is held, since
+ * the thread may remove and free the request (and reset s_li_request
+ * to NULL) under that lock at any time.
+ */
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+        mutex_lock(&ext4_li_mtx);
+        if (!ext4_li_info) {
+                mutex_unlock(&ext4_li_mtx);
+                return;
+        }
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        /* ext4_remove_li_request() tolerates a NULL request. */
+        ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+        mutex_unlock(&ext4_li_mtx);
+}
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_run_li_request) and keep track of the time spent in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ *
+ * @arg is the struct ext4_lazy_init whose li_request_list is serviced.
+ * The thread returns 0 after it has seen the request list empty while
+ * holding both ext4_li_mtx and li_list_mtx; on that path it frees
+ * ext4_li_info and resets the global pointer to NULL.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+        struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+        struct list_head *pos, *n;
+        struct ext4_li_request *elr;
+        unsigned long next_wakeup;
+        DEFINE_WAIT(wait);
+        BUG_ON(NULL == eli);
+        /*
+         * Point the wake-up timer at this task, publish li_task and
+         * wake anyone waiting on li_wait_task for the thread to start.
+         */
+        eli->li_timer.data = (unsigned long)current;
+        eli->li_timer.function = ext4_lazyinode_timeout;
+        eli->li_task = current;
+        wake_up(&eli->li_wait_task);
+cont_thread:
+        while (true) {
+                next_wakeup = MAX_JIFFY_OFFSET;
+                mutex_lock(&eli->li_list_mtx);
+                if (list_empty(&eli->li_request_list)) {
+                        mutex_unlock(&eli->li_list_mtx);
+                        goto exit_thread;
+                }
+                /* _safe variant: entries may be removed while walking. */
+                list_for_each_safe(pos, n, &eli->li_request_list) {
+                        elr = list_entry(pos, struct ext4_li_request,
+                                         lr_request);
+                        if (time_after_eq(jiffies, elr->lr_next_sched)) {
+                                if (ext4_run_li_request(elr) != 0) {
+                                        /*
+                                         * non-zero: error, or no group
+                                         * left to process; remove the
+                                         * lazy_init job
+                                         */
+                                        ext4_remove_li_request(elr);
+                                        continue;
+                                }
+                        }
+                        /* Track the earliest schedule time of the survivors. */
+                        if (time_before(elr->lr_next_sched, next_wakeup))
+                                next_wakeup = elr->lr_next_sched;
+                }
+                mutex_unlock(&eli->li_list_mtx);
+                if (freezing(current))
+                        refrigerator();
+                /*
+                 * Either a request is already due again or no request
+                 * contributed a wake-up time: re-run the loop without
+                 * sleeping.
+                 */
+                if ((time_after_eq(jiffies, next_wakeup)) ||
+                    (MAX_JIFFY_OFFSET == next_wakeup)) {
+                        cond_resched();
+                        continue;
+                }
+                /*
+                 * NOTE(review): add_timer() is issued on every pass; if
+                 * the task can be woken before the previous expiry the
+                 * timer might still be pending here - confirm.
+                 */
+                eli->li_timer.expires = next_wakeup;
+                add_timer(&eli->li_timer);
+                prepare_to_wait(&eli->li_wait_daemon, &wait,
+                                TASK_INTERRUPTIBLE);
+                if (time_before(jiffies, next_wakeup))
+                        schedule();
+                finish_wait(&eli->li_wait_daemon, &wait);
+        }
+exit_thread:
+        /*
+         * It looks like the request list is empty, but we need
+         * to check it under the li_list_mtx lock, to prevent any
+         * additions into it, and of course we should lock ext4_li_mtx
+         * to atomically free the list and ext4_li_info, because at
+         * this point another ext4 filesystem could be registering
+         * new one.
+         */
+        mutex_lock(&ext4_li_mtx);
+        mutex_lock(&eli->li_list_mtx);
+        if (!list_empty(&eli->li_request_list)) {
+                /* A request slipped in meanwhile: go back to servicing. */
+                mutex_unlock(&eli->li_list_mtx);
+                mutex_unlock(&ext4_li_mtx);
+                goto cont_thread;
+        }
+        mutex_unlock(&eli->li_list_mtx);
+        del_timer_sync(&ext4_li_info->li_timer);
+        /*
+         * NOTE(review): li_wait_task lives inside the structure freed
+         * just below; confirm no waiter can still touch it after the
+         * wake_up().
+         */
+        eli->li_task = NULL;
+        wake_up(&eli->li_wait_task);
+        kfree(ext4_li_info);
+        ext4_li_info = NULL;
+        mutex_unlock(&ext4_li_mtx);
+        return 0;
+}
+/*
+ * Remove and free every request queued on ext4_li_info's request list.
+ *
+ * Takes li_list_mtx for the duration of the walk and always releases
+ * it.  There is deliberately no early return for an empty list:
+ * list_for_each_safe() simply iterates zero times, and bailing out
+ * early would leave the mutex held.
+ */
+static void ext4_clear_request_list(void)
+{
+        struct list_head *pos, *n;
+        struct ext4_li_request *elr;
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+                elr = list_entry(pos, struct ext4_li_request,
+                                 lr_request);
+                ext4_remove_li_request(elr);
+        }
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+/*
+ * Start the "ext4lazyinit" kernel thread on ext4_li_info.
+ *
+ * On success the state is flagged EXT4_LAZYINIT_RUNNING and the caller
+ * is blocked until the thread has published itself in li_task; 0 is
+ * returned.  On failure all queued requests are dropped, ext4_li_info
+ * is torn down and reset to NULL, and the kthread_run() error code is
+ * returned.
+ */
+static int ext4_run_lazyinit_thread(void)
+{
+        struct task_struct *task;
+        int err;
+        task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+        if (!IS_ERR(task)) {
+                ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+                wait_event(ext4_li_info->li_wait_task,
+                           ext4_li_info->li_task != NULL);
+                return 0;
+        }
+        err = PTR_ERR(task);
+        ext4_clear_request_list();
+        del_timer_sync(&ext4_li_info->li_timer);
+        kfree(ext4_li_info);
+        ext4_li_info = NULL;
+        printk(KERN_CRIT "EXT4: error %d creating inode table "
+                         "initialization thread\n",
+                         err);
+        return err;
+}
+/*
+ * Decide whether running the itable init thread makes sense.
+ * Returns the number of the first group whose descriptor does not
+ * carry EXT4_BG_INODE_ZEROED; groups whose descriptor cannot be
+ * obtained are skipped.  If no such group exists, the total number
+ * of groups is returned.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t group;
+        for (group = 0; group < ngroups; group++) {
+                struct ext4_group_desc *desc;
+                desc = ext4_get_group_desc(sb, group, NULL);
+                if (desc &&
+                    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                        return group;
+        }
+        return ngroups;
+}
+/*
+ * Allocate and initialise the global lazy-init bookkeeping structure
+ * and install it as ext4_li_info.  Returns 0 on success or -ENOMEM if
+ * the allocation fails (ext4_li_info is left untouched in that case).
+ */
+static int ext4_li_info_new(void)
+{
+        struct ext4_lazy_init *info;
+        info = kzalloc(sizeof(*info), GFP_KERNEL);
+        if (info == NULL)
+                return -ENOMEM;
+        INIT_LIST_HEAD(&info->li_request_list);
+        mutex_init(&info->li_list_mtx);
+        init_waitqueue_head(&info->li_wait_daemon);
+        init_waitqueue_head(&info->li_wait_task);
+        init_timer(&info->li_timer);
+        /* No thread is attached yet. */
+        info->li_task = NULL;
+        info->li_state |= EXT4_LAZYINIT_QUIT;
+        ext4_li_info = info;
+        return 0;
+}
+/*
+ * Allocate a lazy-init request for @sb that will begin at group @start.
+ * Returns the new request, or NULL if the allocation fails.
+ */
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                            ext4_group_t start)
+{
+        struct ext4_li_request *req;
+        unsigned long rnd, delay;
+        req = kzalloc(sizeof(*req), GFP_KERNEL);
+        if (req == NULL)
+                return NULL;
+        req->lr_super = sb;
+        req->lr_sbi = EXT4_SB(sb);
+        req->lr_next_group = start;
+        /*
+         * Pick a random initial delay below
+         * EXT4_DEF_LI_MAX_START_DELAY * HZ jiffies so that requests
+         * from different filesystems do not all start at once.
+         */
+        get_random_bytes(&rnd, sizeof(rnd));
+        delay = rnd % (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+        req->lr_next_sched = jiffies + delay;
+        return req;
+}
+/*
+ * Queue lazy inode-table initialisation for @sb, starting at group
+ * @first_not_zeroed, creating ext4_li_info and starting the lazyinit
+ * thread when they do not exist yet.
+ *
+ * Nothing is queued, and 0 is returned, when a request is already
+ * registered for @sb, when @first_not_zeroed equals the group count
+ * (every inode table is zeroed), when the filesystem is read-only, or
+ * when the INIT_INODE_TABLE mount option is off.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the error
+ * reported by ext4_run_lazyinit_thread().
+ */
+static int ext4_register_li_request(struct super_block *sb,
+                                    ext4_group_t first_not_zeroed)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_li_request *elr;
+        ext4_group_t ngroups = sbi->s_groups_count;
+        /*
+         * Must start out as 0: when the thread is already running no
+         * later statement assigns it before it is returned.
+         */
+        int ret = 0;
+        if (sbi->s_li_request != NULL)
+                return 0;
+        if (first_not_zeroed == ngroups ||
+            (sb->s_flags & MS_RDONLY) ||
+            !test_opt(sb, INIT_INODE_TABLE))
+                return 0;
+        elr = ext4_li_request_new(sb, first_not_zeroed);
+        if (!elr)
+                return -ENOMEM;
+        mutex_lock(&ext4_li_mtx);
+        if (NULL == ext4_li_info) {
+                ret = ext4_li_info_new();
+                if (ret)
+                        goto out;
+        }
+        mutex_lock(&ext4_li_info->li_list_mtx);
+        list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+        /*
+         * Publish the request while li_list_mtx is still held, so a
+         * removal by the lazyinit thread cannot be followed by a stale
+         * store to s_li_request.
+         */
+        sbi->s_li_request = elr;
+        mutex_unlock(&ext4_li_info->li_list_mtx);
+        /*
+         * The request now belongs to the list: whoever removes it
+         * (ext4_remove_li_request(), including via the failure path of
+         * ext4_run_lazyinit_thread()) frees it.  Forget our pointer so
+         * it is not freed a second time below.
+         */
+        elr = NULL;
+        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING))
+                ret = ext4_run_lazyinit_thread();
+out:
+        mutex_unlock(&ext4_li_mtx);
+        /* Non-NULL only if the request was never queued; kfree(NULL) is a no-op. */
+        kfree(elr);
+        return ret;
+}
+/*
+ * Drop every queued lazy-init request and wait for the lazyinit thread
+ * to go away.
+ *
+ * We do not need to lock anything since this is called on
+ * module unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+        /*
+         * If thread exited earlier
+         * there's nothing to be done.
+         */
+        if (!ext4_li_info)
+                return;
+        ext4_clear_request_list();
+        /*
+         * Kick the thread out of its sleep on li_wait_daemon and wait
+         * until it clears li_task.
+         * NOTE(review): the thread frees ext4_li_info and sets it to
+         * NULL right after clearing li_task, yet this loop dereferences
+         * ext4_li_info again afterwards - confirm this cannot touch
+         * freed memory or a NULL pointer.
+         */
+        while (ext4_li_info->li_task) {
+                wake_up(&ext4_li_info->li_wait_daemon);
+                wait_event(ext4_li_info->li_wait_task,
+                           ext4_li_info->li_task == NULL);
+        }
+}
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2567,6 +3004,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+        ext4_group_t first_not_zeroed;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
@@ -2588,8 +3026,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
-        unlock_kernel();
        /* Cleanup superblock name */
        for (cp = sb->s_id; (cp = strchr(cp, '/'));)
                *cp = '!';
@@ -2629,6 +3065,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+        set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sbi->s_mount_opt, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2906,7 +3343,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount2;
                }
        }
-        if (!ext4_check_descriptors(sb)) {
+        if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
@@ -2922,6 +3359,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        err = percpu_counter_init(&sbi->s_freeblocks_counter,
+                        ext4_count_free_blocks(sb));
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_freeinodes_counter,
+                                ext4_count_free_inodes(sb));
+        }
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_dirs_counter,
+                                ext4_count_dirs(sb));
+        }
+        if (!err) {
+                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+        }
+        if (err) {
+                ext4_msg(sb, KERN_ERR, "insufficient memory");
+                goto failed_mount3;
+        }
        sbi->s_stripe = ext4_get_stripe_size(sbi);
        sbi->s_max_writeback_mb_bump = 128;
@@ -3020,22 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-no_journal:
+        /*
-        err = percpu_counter_init(&sbi->s_freeblocks_counter,
+         * The journal may have updated the bg summary counts, so we
-                                  ext4_count_free_blocks(sb));
+         * need to update the global counters.
-        if (!err)
+         */
-                err = percpu_counter_init(&sbi->s_freeinodes_counter,
+        percpu_counter_set(&sbi->s_freeblocks_counter,
-                                          ext4_count_free_inodes(sb));
+                           ext4_count_free_blocks(sb));
-        if (!err)
+        percpu_counter_set(&sbi->s_freeinodes_counter,
-                err = percpu_counter_init(&sbi->s_dirs_counter,
+                           ext4_count_free_inodes(sb));
-                                          ext4_count_dirs(sb));
+        percpu_counter_set(&sbi->s_dirs_counter,
-        if (!err)
+                           ext4_count_dirs(sb));
-                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+        percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
-        if (err) {
-                ext4_msg(sb, KERN_ERR, "insufficient memory");
-                goto failed_mount_wq;
-        }
+no_journal:
        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3127,6 +3579,10 @@ no_journal:
                goto failed_mount4;
        }
+        err = ext4_register_li_request(sb, first_not_zeroed);
+        if (err)
+                goto failed_mount4;
        sbi->s_kobj.kset = ext4_kset;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3164,7 +3620,6 @@ no_journal:
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
-        lock_kernel();
        kfree(orig_data);
        return 0;
@@ -3182,10 +3637,6 @@ failed_mount_wq:
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
-        percpu_counter_destroy(&sbi->s_freeblocks_counter);
-        percpu_counter_destroy(&sbi->s_freeinodes_counter);
-        percpu_counter_destroy(&sbi->s_dirs_counter);
-        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
        if (sbi->s_flex_groups) {
                if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3193,6 +3644,10 @@ failed_mount3:
                else
                        kfree(sbi->s_flex_groups);
        }
+        percpu_counter_destroy(&sbi->s_freeblocks_counter);
+        percpu_counter_destroy(&sbi->s_freeinodes_counter);
+        percpu_counter_destroy(&sbi->s_dirs_counter);
+        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -3211,7 +3666,6 @@ out_fail:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        lock_kernel();
 out_free_orig:
        kfree(orig_data);
        return ret;
@@ -3468,7 +3922,7 @@ static int ext4_load_journal(struct super_block *sb,
        EXT4_SB(sb)->s_journal = journal;
        ext4_clear_journal_err(sb, es);
-        if (journal_devnum &&
+        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
@@ -3522,9 +3976,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-                                        &EXT4_SB(sb)->s_freeblocks_counter));
+                                           &EXT4_SB(sb)->s_freeblocks_counter));
-        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+        es->s_free_inodes_count =
-                                        &EXT4_SB(sb)->s_freeinodes_counter));
+                cpu_to_le32(percpu_counter_sum_positive(
+                                &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
@@ -3720,8 +4175,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
        char *orig_data = kstrdup(data, GFP_KERNEL);
-        lock_kernel();
        /* Store the original options */
        lock_super(sb);
        old_sb_flags = sb->s_flags;
@@ -3844,6 +4297,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        enable_quota = 1;
                }
        }
+        /*
+         * Reinitialize lazy itable initialization thread based on
+         * current settings
+         */
+        if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+                ext4_unregister_li_request(sb);
+        else {
+                ext4_group_t first_not_zeroed;
+                first_not_zeroed = ext4_has_uninit_itable(sb);
+                ext4_register_li_request(sb, first_not_zeroed);
+        }
        ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
                ext4_commit_super(sb, 1);
@@ -3856,7 +4322,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        kfree(old_opts.s_qf_names[i]);
 #endif
        unlock_super(sb);
-        unlock_kernel();
        if (enable_quota)
                dquot_resume(sb, -1);
@@ -3882,7 +4347,6 @@ restore_opts:
        }
 #endif
        unlock_super(sb);
-        unlock_kernel();
        kfree(orig_data);
        return err;
 }
@@ -4116,12 +4580,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 static int ext4_quota_off(struct super_block *sb, int type)
 {
-        /* Force all delayed allocation blocks to be allocated */
+        /* Force all delayed allocation blocks to be allocated.
-        if (test_opt(sb, DELALLOC)) {
+         * Caller already holds s_umount sem */
-                down_read(&sb->s_umount);
+        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);
-                up_read(&sb->s_umount);
-        }
        return dquot_quota_off(sb, type);
 }
@@ -4227,17 +4689,17 @@ out:
 #endif
-static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext2",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -4282,28 +4744,58 @@ static inline void unregister_as_ext3(void) { }
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
-        .get_sb         = ext4_get_sb,
+        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
-static int __init init_ext4_fs(void)
+int __init ext4_init_feat_adverts(void)
+{
+        struct ext4_features *ef;
+        int ret = -ENOMEM;
+        ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+        if (!ef)
+                goto out;
+        ef->f_kobj.kset = ext4_kset;
+        init_completion(&ef->f_kobj_unregister);
+        ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+                                   "features");
+        if (ret) {
+                kfree(ef);
+                goto out;
+        }
+        ext4_feat = ef;
+        ret = 0;
+out:
+        return ret;
+}
+static int __init ext4_init_fs(void)
 {
        int err;
        ext4_check_flag_values();
-        err = init_ext4_system_zone();
+        err = ext4_init_pageio();
        if (err)
                return err;
+        err = ext4_init_system_zone();
+        if (err)
+                goto out5;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
                goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-        err = init_ext4_mballoc();
+        err = ext4_init_feat_adverts();
+        err = ext4_init_mballoc();
        if (err)
                goto out3;
-        err = init_ext4_xattr();
+        err = ext4_init_xattr();
        if (err)
                goto out2;
        err = init_inodecache();
@@ -4314,38 +4806,46 @@ static int __init init_ext4_fs(void)
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
+        ext4_li_info = NULL;
+        mutex_init(&ext4_li_mtx);
        return 0;
 out:
        unregister_as_ext2();
        unregister_as_ext3();
        destroy_inodecache();
 out1:
-        exit_ext4_xattr();
+        ext4_exit_xattr();
 out2:
-        exit_ext4_mballoc();
+        ext4_exit_mballoc();
 out3:
+        kfree(ext4_feat);
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
 out4:
-        exit_ext4_system_zone();
+        ext4_exit_system_zone();
+out5:
+        ext4_exit_pageio();
        return err;
 }
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)
 {
+        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
-        exit_ext4_xattr();
+        ext4_exit_xattr();
-        exit_ext4_mballoc();
+        ext4_exit_mballoc();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
-        exit_ext4_system_zone();
+        ext4_exit_system_zone();
+        ext4_exit_pageio();
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
 MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
+module_init(ext4_init_fs)
-module_exit(exit_ext4_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a8cd8dff1ad..fa4b899da4b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 #undef BLOCK_HASH_SHIFT
 int __init
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
        ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
        if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
 }
 void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
        if (ext4_xattr_cache)
                mb_cache_destroy(ext4_xattr_cache);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 518e96e43905..1ef16520b950 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
-extern int init_ext4_xattr(void);
+extern int __init ext4_init_xattr(void);
-extern void exit_ext4_xattr(void);
+extern void ext4_exit_xattr(void);
 extern const struct xattr_handler *ext4_xattr_handlers[];
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
 {
 }
-static inline int
+static __init inline int
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
        return 0;
 }
 static inline void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
 }
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
                                sb_issue_discard(sb,
                                        fat_clus_to_blknr(sbi, first_cl),
-                                        nr_clus * sbi->sec_per_clus);
+                                        nr_clus * sbi->sec_per_clus,
+                                        GFP_NOFS, 0);
                                first_cl = cluster;
                        }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 830058057d33..ad6998a92c30 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -14,7 +14,6 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
@@ -489,8 +488,6 @@ static void fat_put_super(struct super_block *sb)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        lock_kernel();
        if (sb->s_dirt)
                fat_write_super(sb);
@@ -504,8 +501,6 @@ static void fat_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
        kfree(sbi);
-        unlock_kernel();
 }
 static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
        for (i = 0; i < nr_bhs; i++) {
                wait_on_buffer(bhs[i]);
-                if (buffer_eopnotsupp(bhs[i])) {
+                if (!err && !buffer_uptodate(bhs[i]))
-                        clear_buffer_eopnotsupp(bhs[i]);
-                        err = -EOPNOTSUPP;
-                } else if (!err && !buffer_uptodate(bhs[i]))
                        err = -EIO;
        }
        return err;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index bbc94ae4fd77..3345aabd1dd7 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -662,27 +662,30 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 {
        int res;
+        lock_super(sb);
        res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-        if (res)
+        if (res) {
+                unlock_super(sb);
                return res;
+        }
        sb->s_flags |= MS_NOATIME;
        sb->s_root->d_op = &msdos_dentry_operations;
+        unlock_super(sb);
        return 0;
 }
-static int msdos_get_sb(struct file_system_type *fs_type,
+static struct dentry *msdos_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
-                           mnt);
 }
 static struct file_system_type msdos_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "msdos",
-        .get_sb         = msdos_get_sb,
+        .mount          = msdos_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6fcc7e71fbaa..b936703b8924 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1055,30 +1055,33 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 {
        int res;
+        lock_super(sb);
        res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-        if (res)
+        if (res) {
+                unlock_super(sb);
                return res;
+        }
        if (MSDOS_SB(sb)->options.name_check != 's')
                sb->s_root->d_op = &vfat_ci_dentry_ops;
        else
                sb->s_root->d_op = &vfat_dentry_ops;
+        unlock_super(sb);
        return 0;
 }
-static int vfat_get_sb(struct file_system_type *fs_type,
+static struct dentry *vfat_mount(struct file_system_type *fs_type,
                       int flags, const char *dev_name,
-                       void *data, struct vfsmount *mnt)
+                       void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
-                           mnt);
 }
 static struct file_system_type vfat_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "vfat",
-        .get_sb         = vfat_get_sb,
+        .mount          = vfat_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f8cc34f542c3..ecc8b3954ed6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -640,7 +640,7 @@ static void fasync_free_rcu(struct rcu_head *head)
 * match the state "is the filp on a fasync list".
 *
 */
-static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
        struct fasync_struct *fa, **fp;
        int result = 0;
@@ -666,21 +666,31 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
        return result;
 }
+struct fasync_struct *fasync_alloc(void)
+{
+        return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+}
 /*
- * Add a fasync entry. Return negative on error, positive if
+ * NOTE! This can be used only for unused fasync entries:
- * added, and zero if did nothing but change an existing one.
+ * entries that actually got inserted on the fasync list
+ * need to be released by rcu - see fasync_remove_entry.
+ */
+void fasync_free(struct fasync_struct *new)
+{
+        kmem_cache_free(fasync_cache, new);
+}
+/*
+ * Insert a new entry into the fasync list.  Return the pointer to the
+ * old one if we didn't use the new one.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
-static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
 {
-        struct fasync_struct *new, *fa, **fp;
+        struct fasync_struct *fa, **fp;
-        int result = 0;
-        new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
-        if (!new)
-                return -ENOMEM;
        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
@@ -691,8 +701,6 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
                spin_lock_irq(&fa->fa_lock);
                fa->fa_fd = fd;
                spin_unlock_irq(&fa->fa_lock);
-                kmem_cache_free(fasync_cache, new);
                goto out;
        }
@@ -702,13 +710,39 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
        new->fa_fd = fd;
        new->fa_next = *fapp;
        rcu_assign_pointer(*fapp, new);
-        result = 1;
        filp->f_flags |= FASYNC;
 out:
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
-        return result;
+        return fa;
+}
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+        struct fasync_struct *new;
+        new = fasync_alloc();
+        if (!new)
+                return -ENOMEM;
+        /*
+         * fasync_insert_entry() returns the old (update) entry if
+         * it existed.
+         *
+         * So free the (unused) new entry and return 0 to let the
+         * caller know that we didn't add any new fasync entries.
+         */
+        if (fasync_insert_entry(fd, filp, fapp, new)) {
+                fasync_free(new);
+                return 0;
+        }
+        return 1;
 }
 /*
diff --git a/fs/fifo.c b/fs/fifo.c
index 5d6606ffc2d2..4e303c22d5ee 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -151,4 +151,5 @@ err_nocleanup:
 */
 const struct file_operations def_fifo_fops = {
        .open           = fifo_open,    /* will set read_ or write_pipefifo_fops */
+        .llseek         = noop_llseek,
 };
diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd81c11c..c3dee381f1b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
 * Return the total number of open files in the system
 */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
        return percpu_counter_read_positive(&nr_files);
 }
@@ -68,7 +68,7 @@ static int get_nr_files(void)
 /*
 * Return the maximum number of open files in the system
 */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
        return files_stat.max_files;
 }
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        files_stat.nr_files = get_nr_files();
-        return proc_dointvec(table, write, buffer, lenp, ppos);
+        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
        const struct cred *cred = current_cred();
-        static int old_max;
+        static long old_max;
        struct file * f;
        /*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
        /* Ran out of filps - report that */
        if (get_nr_files() > old_max) {
-                printk(KERN_INFO "VFS: file-max limit %d reached\n",
+                pr_info("VFS: file-max limit %lu reached\n", get_max_files());
-                                        get_max_files());
                old_max = get_nr_files();
        }
        goto fail;
@@ -487,7 +486,7 @@ retry:
 void __init files_init(unsigned long mempages)
 { 
-        int n; 
+        unsigned long n;
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
         */ 
        n = (mempages * (PAGE_SIZE / 1024)) / 10;
-        files_stat.max_files = n; 
+        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-        if (files_stat.max_files < NR_FILE)
-                files_stat.max_files = NR_FILE;
        files_defer_init();
        lg_lock_init(files_lglock);
        percpu_counter_init(&nr_files, 0);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4ea13e7..8c04eac5079d 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
        struct inode                    *ip = NULL;
        if ((ip = new_inode(sbp))) {
+                ip->i_ino = get_next_ino();
                vxfs_iinit(ip, vip);
                ip->i_mapping->a_ops = &vxfs_aops;
        }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 0ec7bb2c95c6..6c5131d592f0 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -36,7 +36,6 @@
 #include <linux/highmem.h>
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "vxfs.h"
 #include "vxfs_dir.h"
@@ -212,16 +211,12 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd)
        if (dp->d_name.len > VXFS_NAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
                                 
-        lock_kernel();
        ino = vxfs_inode_by_name(dip, dp);
        if (ino) {
                ip = vxfs_iget(dip->i_sb, ino);
-                if (IS_ERR(ip)) {
+                if (IS_ERR(ip))
-                        unlock_kernel();
                        return ERR_CAST(ip);
-                }
        }
-        unlock_kernel();
        d_add(dp, ip);
        return NULL;
 }
@@ -248,8 +243,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
        u_long                  page, npages, block, pblocks, nblocks, offset;
        loff_t                  pos;
-        lock_kernel();
        switch ((long)fp->f_pos) {
        case 0:
                if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
@@ -265,10 +258,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
        pos = fp->f_pos - 2;
        
-        if (pos > VXFS_DIRROUND(ip->i_size)) {
+        if (pos > VXFS_DIRROUND(ip->i_size))
-                unlock_kernel();
                return 0;
-        }
        npages = dir_pages(ip);
        nblocks = dir_blocks(ip);
@@ -327,6 +318,5 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 done:
        fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
 out:
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index dc0c041e85cb..9d1c99558389 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -38,7 +38,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
 #include <linux/mount.h>
@@ -81,16 +80,12 @@ vxfs_put_super(struct super_block *sbp)
 {
        struct vxfs_sb_info     *infp = VXFS_SBI(sbp);
-        lock_kernel();
        vxfs_put_fake_inode(infp->vsi_fship);
        vxfs_put_fake_inode(infp->vsi_ilist);
        vxfs_put_fake_inode(infp->vsi_stilist);
        brelse(infp->vsi_bp);
        kfree(infp);
-        unlock_kernel();
 }
 /**
@@ -148,7 +143,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 *   The superblock on success, else %NULL.
 *
 * Locking:
- *   We are under the bkl and @sbp->s_lock.
+ *   We are under @sbp->s_lock.
 */
 static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 {
@@ -251,17 +246,16 @@ out:
 /*
 * The usual module blurb.
 */
-static int vxfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *vxfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
-                           mnt);
 }
 static struct file_system_type vxfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "vxfs",
-        .get_sb         = vxfs_get_sb,
+        .mount          = vxfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1..3d06ccc953aa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
        return sb->s_bdi;
 }
+static inline struct inode *wb_inode(struct list_head *head)
+{
+        return list_entry(head, struct inode, i_wb_list);
+}
 static void bdi_queue_work(struct backing_dev_info *bdi,
                struct wb_writeback_work *work)
 {
@@ -172,11 +177,11 @@ static void redirty_tail(struct inode *inode)
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
-                tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
-        list_move(&inode->i_list, &wb->b_dirty);
+        list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 /*
@@ -186,7 +191,7 @@ static void requeue_io(struct inode *inode)
 {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-        list_move(&inode->i_list, &wb->b_more_io);
+        list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        int do_sb_sort = 0;
        while (!list_empty(delaying_queue)) {
-                inode = list_entry(delaying_queue->prev, struct inode, i_list);
+                inode = wb_inode(delaying_queue->prev);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
-                list_move(&inode->i_list, &tmp);
+                list_move(&inode->i_wb_list, &tmp);
        }
        /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
-                inode = list_entry(tmp.prev, struct inode, i_list);
+                sb = wb_inode(tmp.prev)->i_sb;
-                sb = inode->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
-                        inode = list_entry(pos, struct inode, i_list);
+                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
-                                list_move(&inode->i_list, dispatch_queue);
+                                list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
 }
@@ -408,16 +412,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                         * completion.
                         */
                        redirty_tail(inode);
-                } else if (atomic_read(&inode->i_count)) {
-                        /*
-                         * The inode is clean, inuse
-                         */
-                        list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
-                         * The inode is clean, unused
+                         * The inode is clean.  At this point we either have
+                         * a reference to the inode or it's on its way out.
+                         * No need to add it back to the LRU.
                         */
-                        list_move(&inode->i_list, &inode_unused);
+                        list_del_init(&inode->i_wb_list);
                }
        }
        inode_sync_complete(inode);
@@ -465,8 +466,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                if (inode->i_sb != sb) {
                        if (only_this_sb) {
@@ -487,10 +487,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                        return 0;
                }
-                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+                /*
+                 * Don't bother with new inodes or inodes being freed: the first
+                 * kind does not need periodic writeout yet, and for the latter
+                 * kind writeout is handled by the freer.
+                 */
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                if (inode_dirtied_after(inode, wbc->wb_start))
                        return 1;
-                BUG_ON(inode->i_state & I_FREEING);
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
@@ -536,8 +541,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
                queue_io(wb, wbc->older_than_this);
        while (!list_empty(&wb->b_io)) {
-                struct inode *inode = list_entry(wb->b_io.prev,
+                struct inode *inode = wb_inode(wb->b_io.prev);
-                                                 struct inode, i_list);
                struct super_block *sb = inode->i_sb;
                if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +586,7 @@ static inline bool over_bground_thresh(void)
        global_dirty_limits(&background_thresh, &dirty_thresh);
        return (global_page_state(NR_FILE_DIRTY) +
-                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+                global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 /*
@@ -675,8 +679,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 */
                spin_lock(&inode_lock);
                if (!list_empty(&wb->b_more_io))  {
-                        inode = list_entry(wb->b_more_io.prev,
+                        inode = wb_inode(wb->b_more_io.prev);
-                                                struct inode, i_list);
                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
@@ -704,6 +707,17 @@ get_next_work_item(struct backing_dev_info *bdi)
        return work;
 }
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+        return global_page_state(NR_FILE_DIRTY) +
+                global_page_state(NR_UNSTABLE_NFS) +
+                get_nr_dirty_inodes();
+}
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
        unsigned long expired;
@@ -721,9 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
                return 0;
        wb->last_old_flush = jiffies;
-        nr_pages = global_page_state(NR_FILE_DIRTY) +
+        nr_pages = get_nr_dirty_pages();
-                        global_page_state(NR_UNSTABLE_NFS) +
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        if (nr_pages) {
                struct wb_writeback_work work = {
@@ -790,7 +802,7 @@ int bdi_writeback_thread(void *data)
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
-        current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+        current->flags |= PF_SWAPWRITE;
        set_freezable();
        wb->last_active = jiffies;
@@ -962,7 +974,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
-                        if (hlist_unhashed(&inode->i_hash))
+                        if (inode_unhashed(inode))
                                goto out;
                }
                if (inode->i_state & I_FREEING)
@@ -990,7 +1002,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        }
                        inode->dirtied_when = jiffies;
-                        list_move(&inode->i_list, &bdi->wb.b_dirty);
+                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                }
        }
 out:
@@ -1069,33 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb)
 }
 /**
- * writeback_inodes_sb  -       writeback dirty inodes from given super_block
+ * writeback_inodes_sb_nr -     writeback dirty inodes from given super_block
 * @sb: the superblock
+ * @nr: the number of pages to write
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO. The number of pages submitted is
+ * for IO completion of submitted IO.
- * returned.
 */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
-        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .done           = &done,
+                .nr_pages       = nr,
        };
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        work.nr_pages = nr_dirty + nr_unstable +
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        bdi_queue_work(sb->s_bdi, &work);
        wait_for_completion(&done);
 }
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+/**
+ * writeback_inodes_sb  -       writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+        return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
 EXPORT_SYMBOL(writeback_inodes_sb);
 /**
@@ -1118,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 /**
+ * writeback_inodes_sb_nr_if_idle  -    start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+                                   unsigned long nr)
+{
+        if (!writeback_in_progress(sb->s_bdi)) {
+                down_read(&sb->s_umount);
+                writeback_inodes_sb_nr(sb, nr);
+                up_read(&sb->s_umount);
+                return 1;
+        } else
+                return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+/**
 * sync_inodes_sb       -       sync sb inode pages
 * @sb: the superblock
 *
@@ -1198,3 +1240,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
        return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+        struct writeback_control wbc = {
+                .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+                .nr_to_write = 0, /* metadata-only */
+        };
+        return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd63d2f9..85542a7daf40 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -179,23 +179,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 static const struct file_operations fuse_ctl_abort_ops = {
        .open = nonseekable_open,
        .write = fuse_conn_abort_write,
+        .llseek = no_llseek,
 };
 static const struct file_operations fuse_ctl_waiting_ops = {
        .open = nonseekable_open,
        .read = fuse_conn_waiting_read,
+        .llseek = no_llseek,
 };
 static const struct file_operations fuse_conn_max_background_ops = {
        .open = nonseekable_open,
        .read = fuse_conn_max_background_read,
        .write = fuse_conn_max_background_write,
+        .llseek = no_llseek,
 };
 static const struct file_operations fuse_conn_congestion_threshold_ops = {
        .open = nonseekable_open,
        .read = fuse_conn_congestion_threshold_read,
        .write = fuse_conn_congestion_threshold_write,
+        .llseek = no_llseek,
 };
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
@@ -218,6 +222,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = fc->user_id;
        inode->i_gid = fc->group_id;
@@ -317,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
-                        const char *dev_name, void *raw_data,
+                        int flags, const char *dev_name, void *raw_data)
-                        struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, raw_data,
+        return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
-                                fuse_ctl_fill_super, mnt);
 }
 static void fuse_ctl_kill_sb(struct super_block *sb)
@@ -341,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb)
 static struct file_system_type fuse_ctl_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fusectl",
-        .get_sb         = fuse_ctl_get_sb,
+        .mount          = fuse_ctl_mount,
        .kill_sb        = fuse_ctl_kill_sb,
 };
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e1f8171278bd..3e87cce5837d 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -182,6 +182,7 @@ static const struct file_operations cuse_frontend_fops = {
        .unlocked_ioctl         = cuse_file_ioctl,
        .compat_ioctl           = cuse_file_compat_ioctl,
        .poll                   = fuse_file_poll,
+        .llseek         = noop_llseek,
 };
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cde755cca564..6e07696308dc 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -809,11 +809,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
        int err;
        struct page *page = *pagep;
-        if (page && zeroing && count < PAGE_SIZE) {
+        if (page && zeroing && count < PAGE_SIZE)
-                void *mapaddr = kmap_atomic(page, KM_USER1);
+                clear_highpage(page);
-                memset(mapaddr, 0, PAGE_SIZE);
-                kunmap_atomic(mapaddr, KM_USER1);
-        }
        while (count) {
                if (cs->write && cs->pipebufs && page) {
                        return fuse_ref_page(cs, page, offset, count);
@@ -830,10 +828,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                        }
                }
                if (page) {
-                        void *mapaddr = kmap_atomic(page, KM_USER1);
+                        void *mapaddr = kmap_atomic(page, KM_USER0);
                        void *buf = mapaddr + offset;
                        offset += fuse_copy_do(cs, &buf, &count);
-                        kunmap_atomic(mapaddr, KM_USER1);
+                        kunmap_atomic(mapaddr, KM_USER0);
                } else
                        offset += fuse_copy_do(cs, NULL, &count);
        }
@@ -1336,12 +1334,7 @@ out_finish:
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-        int i;
+        release_pages(req->pages, req->num_pages, 0);
-        for (i = 0; i < req->num_pages; i++) {
-                struct page *page = req->pages[i];
-                page_cache_release(page);
-        }
 }
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index da9e6e11374c..cfce3ad86a92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        return err;
 }
-static int fuse_get_sb(struct file_system_type *fs_type,
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
                       int flags, const char *dev_name,
-                       void *raw_data, struct vfsmount *mnt)
+                       void *raw_data)
 {
-        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
+        return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
 }
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1065,17 +1065,16 @@ static struct file_system_type fuse_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuse",
        .fs_flags       = FS_HAS_SUBTYPE,
-        .get_sb         = fuse_get_sb,
+        .mount          = fuse_mount,
        .kill_sb        = fuse_kill_sb_anon,
 };
 #ifdef CONFIG_BLOCK
-static int fuse_get_sb_blk(struct file_system_type *fs_type,
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
                           int flags, const char *dev_name,
-                           void *raw_data, struct vfsmount *mnt)
+                           void *raw_data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
-                           mnt);
 }
 static void fuse_kill_sb_blk(struct super_block *sb)
@@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb)
 static struct file_system_type fuseblk_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "fuseblk",
-        .get_sb         = fuse_get_sb_blk,
+        .mount          = fuse_mount_blk,
        .kill_sb        = fuse_kill_sb_blk,
        .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb96aae..4f36f8832b9b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        struct gfs2_alloc *al = NULL;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-        unsigned to = from + len;
        struct page *page;
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        }
 prepare_write:
-        error = block_prepare_write(page, from, to, gfs2_block_map);
+        error = __block_write_begin(page, from, len, gfs2_block_map);
 out:
        if (error == 0)
                return 0;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d34..5ab3839dfcb9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
                                      struct gfs2_inum_host *inum)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
-        struct gfs2_holder i_gh;
        struct inode *inode;
        struct dentry *dentry;
-        int error;
        inode = gfs2_ilookup(sb, inum->no_addr);
        if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
                goto out_inode;
        }
-        error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+        inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
-                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+                                    GFS2_BLKST_DINODE);
-        if (error)
+        if (IS_ERR(inode))
-                return ERR_PTR(error);
+                return ERR_CAST(inode);
-        error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
-        if (error)
-                goto fail;
-        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
-        if (IS_ERR(inode)) {
-                error = PTR_ERR(inode);
-                goto fail;
-        }
-        error = gfs2_inode_refresh(GFS2_I(inode));
-        if (error) {
-                iput(inode);
-                goto fail;
-        }
-        /* Pick up the works we bypass in gfs2_inode_lookup */
-        if (inode->i_state & I_NEW) 
-                gfs2_set_iop(inode);
-        if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
-                iput(inode);
-                goto fail;
-        }
-        error = -EIO;
-        if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
-                iput(inode);
-                goto fail;
-        }
-        gfs2_glock_dq_uninit(&i_gh);
 out_inode:
        dentry = d_obtain_alias(inode);
        if (!IS_ERR(dentry))
                dentry->d_op = &gfs2_dops;
        return dentry;
-fail:
-        gfs2_glock_dq_uninit(&i_gh);
-        return ERR_PTR(error);
 }
 static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 237ee6a940df..aa996471ec5c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -622,6 +622,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 * cluster; until we do, disable leases (by just returning -EINVAL),
 * unless the administrator has requested purely local locking.
 *
+ * Locking: called under lock_flocks
+ *
 * Returns: errno
 */
@@ -773,6 +775,7 @@ const struct file_operations gfs2_dir_fops = {
        .fsync          = gfs2_fsync,
        .lock           = gfs2_lock,
        .flock          = gfs2_flock,
+        .llseek         = default_llseek,
 };
 #endif /* CONFIG_GFS2_FS_LOCKING_DLM */
@@ -799,5 +802,6 @@ const struct file_operations gfs2_dir_fops_nolock = {
        .open           = gfs2_open,
        .release        = gfs2_close,
        .fsync          = gfs2_fsync,
+        .llseek         = default_llseek,
 };
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f099..f92c17704169 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work)
 {
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct gfs2_inode *ip = NULL;
+        struct gfs2_inode *ip;
        struct inode *inode;
-        u64 no_addr = 0;
+        u64 no_addr = gl->gl_name.ln_number;
+        ip = gl->gl_object;
+        /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
-        spin_lock(&gl->gl_spin);
-        ip = (struct gfs2_inode *)gl->gl_object;
        if (ip)
-                no_addr = ip->i_no_addr;
-        spin_unlock(&gl->gl_spin);
-        if (ip) {
                inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
-                if (inode) {
+        else
-                        d_prune_aliases(inode);
+                inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
-                        iput(inode);
+        if (inode && !IS_ERR(inode)) {
-                }
+                d_prune_aliases(inode);
+                iput(inode);
        }
        gfs2_glock_put(gl);
 }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8cf..e1213f7f9217 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
        return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
-struct gfs2_skip_data {
-        u64     no_addr;
-        int     skipped;
-};
-static int iget_skip_test(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (ip->i_no_addr == data->no_addr) {
-                if (inode->i_state & (I_FREEING|I_WILL_FREE)){
-                        data->skipped = 1;
-                        return 0;
-                }
-                return 1;
-        }
-        return 0;
-}
-static int iget_skip_set(struct inode *inode, void *opaque)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_skip_data *data = opaque;
-        if (data->skipped)
-                return 1;
-        inode->i_ino = (unsigned long)(data->no_addr);
-        ip->i_no_addr = data->no_addr;
-        return 0;
-}
-static struct inode *gfs2_iget_skip(struct super_block *sb,
-                                    u64 no_addr)
-{
-        struct gfs2_skip_data data;
-        unsigned long hash = (unsigned long)no_addr;
-        data.no_addr = no_addr;
-        data.skipped = 0;
-        return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
-}
 /**
 * GFS2 lookup code fills in vfs inode contents based on info obtained
 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
        return ERR_PTR(error);
 }
-/**
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+                                  u64 *no_formal_ino, unsigned int blktype)
- *                               and try to reclaim it by doing iput.
- *
- * This function assumes no rgrp locks are currently held.
- *
- * @sb: The super block
- * no_addr: The inode number
- *
- */
-void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
-        struct gfs2_sbd *sdp;
+        struct super_block *sb = sdp->sd_vfs;
-        struct gfs2_inode *ip;
+        struct gfs2_holder i_gh;
-        struct gfs2_glock *io_gl = NULL;
-        int error;
-        struct gfs2_holder gh;
        struct inode *inode;
+        int error;
-        inode = gfs2_iget_skip(sb, no_addr);
+        error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
+                                  LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-        if (!inode)
+        if (error)
-                return;
+                return ERR_PTR(error);
-        /* If it's not a new inode, someone's using it, so leave it alone. */
-        if (!(inode->i_state & I_NEW)) {
-                iput(inode);
-                return;
-        }
-        ip = GFS2_I(inode);
-        sdp = GFS2_SB(inode);
-        ip->i_no_formal_ino = -1;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+        error = gfs2_check_blk_type(sdp, no_addr, blktype);
-        if (unlikely(error))
+        if (error)
                goto fail;
-        ip->i_gl->gl_object = ip;
-        error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+        inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
-        if (unlikely(error))
+        if (IS_ERR(inode))
-                goto fail_put;
+                goto fail;
-        set_bit(GIF_INVALID, &ip->i_flags);
-        error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
-                                   &ip->i_iopen_gh);
-        if (unlikely(error))
-                goto fail_iopen;
-        ip->i_iopen_gh.gh_gl->gl_object = ip;
+        error = gfs2_inode_refresh(GFS2_I(inode));
-        gfs2_glock_put(io_gl);
+        if (error)
-        io_gl = NULL;
+                goto fail_iput;
-        inode->i_mode = DT2IF(DT_UNKNOWN);
+        /* Pick up the works we bypass in gfs2_inode_lookup */
+        if (inode->i_state & I_NEW) 
+                gfs2_set_iop(inode);
-        /*
+        /* Two extra checks for NFS only */
-         * We must read the inode in order to work out its type in
+        if (no_formal_ino) {
-         * this case. Note that this doesn't happen often as we normally
+                error = -ESTALE;
-         * know the type beforehand. This code path only occurs during
+                if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
-         * unlinked inode recovery (where it is safe to do this glock,
+                        goto fail_iput;
-         * which is not true in the general case).
-         */
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
-                                   &gh);
-        if (unlikely(error))
-                goto fail_glock;
-        /* Inode is now uptodate */
+                error = -EIO;
-        gfs2_glock_dq_uninit(&gh);
+                if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
-        gfs2_set_iop(inode);
+                        goto fail_iput;
-        /* The iput will cause it to be deleted. */
+                error = 0;
-        iput(inode);
+        }
-        return;
-fail_glock:
-        gfs2_glock_dq(&ip->i_iopen_gh);
-fail_iopen:
-        if (io_gl)
-                gfs2_glock_put(io_gl);
-fail_put:
-        ip->i_gl->gl_object = NULL;
-        gfs2_glock_put(ip->i_gl);
 fail:
-        iget_failed(inode);
+        gfs2_glock_dq_uninit(&i_gh);
-        return;
+        return error ? ERR_PTR(error) : inode;
+fail_iput:
+        iput(inode);
+        goto fail;
 }
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc6..d8499fadcc53 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                       u64 no_addr, u64 no_formal_ino);
-extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+                                         u64 *no_formal_ino,
+                                         unsigned int blktype);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6f..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        lh->lh_hash = cpu_to_be32(hash);
        bh->b_end_io = end_buffer_write_sync;
-        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-                goto skip_barrier;
        get_bh(bh);
-        submit_bh(WRITE_BARRIER | REQ_META, bh);
+        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-        wait_on_buffer(bh);
-        if (buffer_eopnotsupp(bh)) {
-                clear_buffer_eopnotsupp(bh);
-                set_buffer_uptodate(bh);
-                fs_info(sdp, "barrier sync failed - disabling barriers\n");
-                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-                lock_buffer(bh);
-skip_barrier:
-                get_bh(bh);
                submit_bh(WRITE_SYNC | REQ_META, bh);
-                wait_on_buffer(bh);
+        else
-        }
+                submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+        wait_on_buffer(bh);
        if (!buffer_uptodate(bh))
                gfs2_io_error_bh(sdp, bh);
        brelse(bh);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index d7eb1e209aa8..ebef7ab6e17e 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -144,7 +144,7 @@ static int __init init_gfs2_fs(void)
        error = -ENOMEM;
        gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-                                          WQ_RESCUER | WQ_FREEZEABLE, 0);
+                                          WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
        if (!gfs_recovery_wq)
                goto fail_wq;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f3b071f921aa..939739c7b3f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
                 * activity, but those code paths have their own higher-level
                 * throttling.
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc233dc89..3eb1393f7b81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@ fail_sb:
 fail_locking:
        init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-        invalidate_inodes(sb);
        gfs2_gl_hash_clear(sdp);
        gfs2_lm_unmount(sdp);
 fail_sys:
@@ -1251,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 }
 /**
- * gfs2_get_sb - Get the GFS2 superblock
+ * gfs2_mount - Get the GFS2 superblock
 * @fs_type: The GFS2 filesystem type
 * @flags: Mount flags
 * @dev_name: The name of the device
 * @data: The mount arguments
- * @mnt: The vfsmnt for this mount
 *
 * Q. Why not use get_sb_bdev() ?
 * A. We need to select one of two root directories to mount, independent
@@ -1265,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr)
 * Returns: 0 or -ve on error
 */
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
-                       const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data)
 {
        struct block_device *bdev;
        struct super_block *s;
@@ -1280,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
-                return PTR_ERR(bdev);
+                return ERR_CAST(bdev);
        /*
         * once the super is inserted into the list by sget, s_umount
@@ -1299,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        if (IS_ERR(s))
                goto error_bdev;
+        if (s->s_root)
+                close_bdev_exclusive(bdev, mode);
        memset(&args, 0, sizeof(args));
        args.ar_quota = GFS2_QUOTA_DEFAULT;
        args.ar_data = GFS2_DATA_DEFAULT;
@@ -1310,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        error = gfs2_mount_args(&args, data);
        if (error) {
                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-                if (s->s_root)
+                goto error_super;
-                        goto error_super;
-                deactivate_locked_super(s);
-                return error;
        }
        if (s->s_root) {
                error = -EBUSY;
                if ((flags ^ s->s_flags) & MS_RDONLY)
                        goto error_super;
-                close_bdev_exclusive(bdev, mode);
        } else {
                char b[BDEVNAME_SIZE];
@@ -1329,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
-                if (error) {
+                if (error)
-                        deactivate_locked_super(s);
+                        goto error_super;
-                        return error;
-                }
                s->s_flags |= MS_ACTIVE;
                bdev->bd_super = s;
        }
        sdp = s->s_fs_info;
-        mnt->mnt_sb = s;
        if (args.ar_meta)
-                mnt->mnt_root = dget(sdp->sd_master_dir);
+                return dget(sdp->sd_master_dir);
        else
-                mnt->mnt_root = dget(sdp->sd_root_dir);
+                return dget(sdp->sd_root_dir);
-        return 0;
 error_super:
        deactivate_locked_super(s);
+        return ERR_PTR(error);
 error_bdev:
        close_bdev_exclusive(bdev, mode);
-        return error;
+        return ERR_PTR(error);
 }
 static int set_meta_super(struct super_block *s, void *ptr)
@@ -1357,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr)
        return -EINVAL;
 }
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
-                            const char *dev_name, void *data, struct vfsmount *mnt)
+                        int flags, const char *dev_name, void *data)
 {
        struct super_block *s;
        struct gfs2_sbd *sdp;
@@ -1369,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
        if (error) {
                printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
                       dev_name, error);
-                return error;
+                return ERR_PTR(error);
        }
        s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
                 path.dentry->d_inode->i_sb->s_bdev);
        path_put(&path);
        if (IS_ERR(s)) {
                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        }
        if ((flags ^ s->s_flags) & MS_RDONLY) {
                deactivate_locked_super(s);
-                return -EBUSY;
+                return ERR_PTR(-EBUSY);
        }
        sdp = s->s_fs_info;
-        mnt->mnt_sb = s;
+        return dget(sdp->sd_master_dir);
-        mnt->mnt_root = dget(sdp->sd_master_dir);
-        return 0;
 }
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1411,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 struct file_system_type gfs2_fs_type = {
        .name = "gfs2",
        .fs_flags = FS_REQUIRES_DEV,
-        .get_sb = gfs2_get_sb,
+        .mount = gfs2_mount,
        .kill_sb = gfs2_kill_sb,
        .owner = THIS_MODULE,
 };
@@ -1419,7 +1411,7 @@ struct file_system_type gfs2_fs_type = {
 struct file_system_type gfs2meta_fs_type = {
        .name = "gfs2meta",
        .fs_flags = FS_REQUIRES_DEV,
-        .get_sb = gfs2_get_sb_meta,
+        .mount = gfs2_mount_meta,
        .owner = THIS_MODULE,
 };
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510200d5..12cbea7502c2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@ out_parent:
        gfs2_holder_uninit(ghs);
        gfs2_holder_uninit(ghs + 1);
        if (!error) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_instantiate(dentry, inode);
                mark_inode_dirty(inode);
        }
@@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        int error;
        if (!page_has_buffers(page)) {
-                error = block_prepare_write(page, from, to, gfs2_block_map);
+                error = __block_write_begin(page, from, to - from, gfs2_block_map);
                if (unlikely(error))
                        return error;
@@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                next += bh->b_size;
                if (buffer_mapped(bh)) {
                        if (end) {
-                                error = block_prepare_write(page, start, end,
+                                error = __block_write_begin(page, start, end - start,
                                                            gfs2_block_map);
                                if (unlikely(error))
                                        return error;
@@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
        } while (next < to);
        if (end) {
-                error = block_prepare_write(page, start, end, gfs2_block_map);
+                error = __block_write_begin(page, start, end - start, gfs2_block_map);
                if (unlikely(error))
                        return error;
                empty_write_end(page, start, end);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fb67f593f408..33c8407b876f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -866,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
                                if ((start + nr_sects) != blk) {
                                        rv = blkdev_issue_discard(bdev, start,
                                                            nr_sects, GFP_NOFS,
-                                                            BLKDEV_IFL_WAIT |
+                                                            0);
-                                                            BLKDEV_IFL_BARRIER);
                                        if (rv)
                                                goto fail;
                                        nr_sects = 0;
@@ -881,8 +880,7 @@ start_new_extent:
                }
        }
        if (nr_sects) {
-                rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+                rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
-                                         BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
                if (rv)
                        goto fail;
        }
@@ -965,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 *          The inode, if one has been found, in inode.
 */
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
-                           u64 skip)
 {
        u32 goal = 0, block;
        u64 no_addr;
        struct gfs2_sbd *sdp = rgd->rd_sbd;
        unsigned int n;
+        struct gfs2_glock *gl;
+        struct gfs2_inode *ip;
+        int error;
+        int found = 0;
-        for(;;) {
+        while (goal < rgd->rd_data) {
-                if (goal >= rgd->rd_data)
-                        break;
                down_write(&sdp->sd_log_flush_lock);
                n = 1;
                block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -992,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
                if (no_addr == skip)
                        continue;
                *last_unlinked = no_addr;
-                return no_addr;
+                error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+                if (error)
+                        continue;
+                /* If the inode is already in cache, we can ignore it here
+                 * because the existing inode disposal code will deal with
+                 * it when all refs have gone away. Accessing gl_object like
+                 * this is not safe in general. Here it is ok because we do
+                 * not dereference the pointer, and we only need an approx
+                 * answer to whether it is NULL or not.
+                 */
+                ip = gl->gl_object;
+                if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+                        gfs2_glock_put(gl);
+                else
+                        found++;
+                /* Limit reclaim to sensible number of tasks */
+                if (found > 2*NR_CPUS)
+                        return;
        }
        rgd->rd_flags &= ~GFS2_RDF_CHECK;
-        return 0;
+        return;
 }
 /**
@@ -1077,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
 * Try to acquire rgrp in way which avoids contending with others.
 *
 * Returns: errno
- *          unlinked: the block address of an unlinked block to be reclaimed
 */
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
-                          u64 *last_unlinked)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1091,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
        int loops = 0;
        int error, rg_locked;
-        *unlinked = 0;
        rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
        while (rgd) {
@@ -1108,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        /* If the rg came in already locked, there's no
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                           way we can recover from a failed try_rgrp_unlink
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                           because that would require an iput which can only
-                           happen after the rgrp is unlocked. */
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
-                                                           ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        /* fall through */
                case GLR_TRYFAILED:
                        rgd = recent_rgrp_next(rgd);
@@ -1147,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
                case 0:
                        if (try_rgrp_fit(rgd, al))
                                goto out;
-                        if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+                        if (rgd->rd_flags & GFS2_RDF_CHECK)
-                                *unlinked = try_rgrp_unlink(rgd, last_unlinked,
+                                try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
-                                                            ip->i_no_addr);
                        if (!rg_locked)
                                gfs2_glock_dq_uninit(&al->al_rgd_gh);
-                        if (*unlinked)
-                                return -EAGAIN;
                        break;
                case GLR_TRYFAILED:
@@ -1206,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
        int error = 0;
-        u64 last_unlinked = NO_BLOCK, unlinked;
+        u64 last_unlinked = NO_BLOCK;
+        int tries = 0;
        if (gfs2_assert_warn(sdp, al->al_requested))
                return -EINVAL;
-try_again:
        if (hold_rindex) {
                /* We need to hold the rindex unless the inode we're using is
                   the rindex itself, in which case it's already held. */
@@ -1220,31 +1227,23 @@ try_again:
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
                        error = gfs2_ri_update_special(ip);
+                if (error)
+                        return error;
        }
-        if (error)
+        do {
-                return error;
+                error = get_local_rgrp(ip, &last_unlinked);
+                /* If there is no space, flushing the log may release some */
+                if (error)
+                        gfs2_log_flush(sdp, NULL);
+        } while (error && tries++ < 3);
-        /* Find an rgrp suitable for allocation.  If it encounters any unlinked
-           dinodes along the way, error will equal -EAGAIN and unlinked will
-           contains it block address. We then need to look up that inode and
-           try to free it, and try the allocation again. */
-        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
                if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
-                if (error != -EAGAIN)
+                return error;
-                        return error;
-                gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-                /* regardless of whether or not gfs2_process_unlinked_inode
-                   was successful, we don't want to repeat it again. */
-                last_unlinked = unlinked;
-                gfs2_log_flush(sdp, NULL);
-                error = 0;
-                goto try_again;
        }
        /* no error, so we have the rgrp set in the inode's allocation. */
        al->al_file = file;
        al->al_line = line;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d1176096c..2b2c4997430b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@ restart:
        gfs2_clear_rgrpd(sdp);
        gfs2_jindex_free(sdp);
        /*  Take apart glock structures and buffer lists  */
-        invalidate_inodes(sdp->sd_vfs);
        gfs2_gl_hash_clear(sdp);
        /*  Unmount the locking protocol  */
        gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651aaa51..c8cffb81e849 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@ struct hfs_sb_info {
        u16 blockoffset;
        int fs_div;
-        struct hlist_head rsrc_inodes;
 };
 #define HFS_FLG_BITMAP_DIRTY    0
@@ -254,17 +252,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb)
        sb->s_dirt = 1;
 }
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
-        while (buffer_locked(bh)) {
-                wait_on_buffer(bh);
-        }
-        if (buffer_dirty(bh)) {
-                ll_rw_block(WRITE, 1, &bh);
-                wait_on_buffer(bh);
-        }
-}
 #define sb_bread512(sb, sec, data) ({                   \
        struct buffer_head *__bh;                       \
        sector_t __block;                               \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7adc7ce6..dffb4e996643 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
        HFS_I(inode)->rsrc_inode = dir;
        HFS_I(dir)->rsrc_inode = inode;
        igrab(dir);
-        hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
        d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5ac991..1563d5ce5764 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb)
                mdb->drLsMod = hfs_mtime();
                mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
        }
        return 0;
@@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb)
                HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
                HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
                mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
-                hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+                sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
        }
        if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34235d4bf08b..4824c27cebb8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include "hfs_fs.h"
@@ -79,15 +78,11 @@ static int hfs_sync_fs(struct super_block *sb, int wait)
 */
 static void hfs_put_super(struct super_block *sb)
 {
-        lock_kernel();
        if (sb->s_dirt)
                hfs_write_super(sb);
        hfs_mdb_close(sb);
        /* release the MDB's resources */
        hfs_mdb_put(sb);
-        unlock_kernel();
 }
 /*
@@ -385,8 +380,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        sb->s_fs_info = sbi;
-        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
        res = -EINVAL;
        if (!parse_options((char *)data, sbi)) {
@@ -446,17 +441,16 @@ bail:
        return res;
 }
-static int hfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hfs_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name, void *data,
+                      int flags, const char *dev_name, void *data)
-                      struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
 }
 static struct file_system_type hfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hfs",
-        .get_sb         = hfs_get_sb,
+        .mount          = hfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85ec9d7..9d59c0571f59 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
        inc_nlink(inode);
        hfsplus_instantiate(dst_dentry, inode, cnid);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
        sbi->file_count++;
@@ -317,8 +317,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                res = hfsplus_rename_cat(inode->i_ino,
                                         dir, &dentry->d_name,
                                         sbi->hidden_dir, &str);
-                if (!res)
+                if (!res) {
                        inode->i_flags |= S_DEAD;
+                        drop_nlink(inode);
+                }
                goto out;
        }
        res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 78449280dae0..8afd7e84f98d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        inode->i_hash.pprev = &inode->i_hash.next;
+        hlist_add_fake(&inode->i_hash);
        mark_inode_dirty(inode);
 out:
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5b4667e08ef7..40a85a3ded6e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -92,7 +92,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
        mark_inode_dirty(inode);
 out_unlock_inode:
-        mutex_lock(&inode->i_mutex);
+        mutex_unlock(&inode->i_mutex);
 out_drop_write:
        mnt_drop_write(file->f_path.mnt);
 out:
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a88d7536103..52cc746d3ba3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode)
 #define HFSPLUS_INODE_SIZE      sizeof(struct hfsplus_inode_info)
-static int hfsplus_get_sb(struct file_system_type *fs_type,
+static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
-                          int flags, const char *dev_name, void *data,
+                          int flags, const char *dev_name, void *data)
-                          struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
-                           mnt);
 }
 static struct file_system_type hfsplus_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hfsplus",
-        .get_sb         = hfsplus_get_sb,
+        .mount          = hfsplus_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6bbd75c5589b..bf15a43016b9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -28,12 +28,7 @@
 * #define ATTR_KILL_SUID       2048
 * #define ATTR_KILL_SGID       4096
 *
- * and this is because they were added in 2.5 development in this patch:
+ * and this is because they were added in 2.5 development.
- *
- * http://linux.bkbits.net:8080/linux-2.5/
- * cset@3caf4a12k4XgDzK7wyK-TGpSZ9u2Ww?nav=index.html
- * |src/.|src/include|src/include/linux|related/include/linux/fs.h
- *
 * Actually, they are not needed by most ->setattr() methods - they are set by
 * callers of notify_change() to notify that the setuid/setgid bits must be
 * dropped.
@@ -96,7 +91,6 @@ extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
                     long long *bfree_out, long long *bavail_out,
                     long long *files_out, long long *ffree_out,
-                     void *fsid_out, int fsid_size, long *namelen_out,
+                     void *fsid_out, int fsid_size, long *namelen_out);
-                     long *spare_out);
 #endif
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f7dc9b5f9ef8..2c0f148a49e6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
        err = do_statfs(dentry->d_sb->s_fs_info,
                        &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
                        &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
-                        &sf->f_namelen, sf->f_spare);
+                        &sf->f_namelen);
        if (err)
                return err;
        sf->f_blocks = f_blocks;
@@ -962,11 +962,11 @@ out:
        return err;
 }
-static int hostfs_read_sb(struct file_system_type *type,
+static struct dentry *hostfs_read_sb(struct file_system_type *type,
                          int flags, const char *dev_name,
-                          void *data, struct vfsmount *mnt)
+                          void *data)
 {
-        return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
+        return mount_nodev(type, flags, data, hostfs_fill_sb_common);
 }
 static void hostfs_kill_sb(struct super_block *s)
@@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s)
 static struct file_system_type hostfs_type = {
        .owner          = THIS_MODULE,
        .name           = "hostfs",
-        .get_sb         = hostfs_read_sb,
+        .mount          = hostfs_read_sb,
        .kill_sb        = hostfs_kill_sb,
        .fs_flags       = 0,
 };
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 6777aa06ce2c..d51a98384bc0 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -94,8 +94,7 @@ void *open_dir(char *path, int *err_out)
        dir = opendir(path);
        *err_out = errno;
-        if (dir == NULL)
-                return NULL;
        return dir;
 }
@@ -205,7 +204,7 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
        if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
                if (fd >= 0) {
                        if (fchmod(fd, attrs->ia_mode) != 0)
-                                return (-errno);
+                                return -errno;
                } else if (chmod(file, attrs->ia_mode) != 0) {
                        return -errno;
                }
@@ -364,8 +363,7 @@ int rename_file(char *from, char *to)
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
              long long *bfree_out, long long *bavail_out,
              long long *files_out, long long *ffree_out,
-              void *fsid_out, int fsid_size, long *namelen_out,
+              void *fsid_out, int fsid_size, long *namelen_out)
-              long *spare_out)
 {
        struct statfs64 buf;
        int err;
@@ -384,10 +382,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
               sizeof(buf.f_fsid) > fsid_size ? fsid_size :
               sizeof(buf.f_fsid));
        *namelen_out = buf.f_namelen;
-        spare_out[0] = buf.f_spare[0];
-        spare_out[1] = buf.f_spare[1];
-        spare_out[2] = buf.f_spare[2];
-        spare_out[3] = buf.f_spare[3];
-        spare_out[4] = buf.f_spare[4];
        return 0;
 }
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
 config HPFS_FS
        tristate "OS/2 HPFS file system support"
        depends on BLOCK
+        depends on BKL # nontrivial to fix
        help
          OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
          is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index eac5f96323e3..793cb9d943d2 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
        printk("lock creation\n");
 #endif
-        down(&hpfs_sb(s)->hpfs_creation_de);
+        mutex_lock(&hpfs_sb(s)->hpfs_creation_de);
 }
 void hpfs_unlock_creation(struct super_block *s)
@@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s)
 #ifdef DEBUG_LOCKS
        printk("unlock creation\n");
 #endif
-        up(&hpfs_sb(s)->hpfs_creation_de);
+        mutex_unlock(&hpfs_sb(s)->hpfs_creation_de);
 }
 /* Map a sector into a buffer and return pointers to it and to the buffer. */
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b59eac0232a0..2fee17d0d9ab 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -87,7 +87,7 @@ struct hpfs_sb_info {
        unsigned *sb_bmp_dir;           /* main bitmap directory */
        unsigned sb_c_bitmap;           /* current bitmap */
        unsigned sb_max_fwd_alloc;      /* max forwad allocation */
-        struct semaphore hpfs_creation_de; /* when creating dirents, nobody else
+        struct mutex hpfs_creation_de;  /* when creating dirents, nobody else
                                           can alloc blocks */
        /*unsigned sb_mounting : 1;*/
        int sb_timeshift;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 2607010be2fe..6c5f01597c3a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -477,17 +477,21 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
        int o;
+        lock_kernel();
        save_mount_options(s, options);
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-        if (!sbi)
+        if (!sbi) {
+                unlock_kernel();
                return -ENOMEM;
+        }
        s->s_fs_info = sbi;
        sbi->sb_bmp_dir = NULL;
        sbi->sb_cp_table = NULL;
-        init_MUTEX(&sbi->hpfs_creation_de);
+        mutex_init(&sbi->hpfs_creation_de);
        uid = current_uid();
        gid = current_gid();
@@ -666,6 +670,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                        root->i_blocks = 5;
                hpfs_brelse4(&qbh);
        }
+        unlock_kernel();
        return 0;
 bail4:  brelse(bh2);
@@ -677,20 +682,20 @@ bail0:
        kfree(sbi->sb_cp_table);
        s->s_fs_info = NULL;
        kfree(sbi);
+        unlock_kernel();
        return -EINVAL;
 }
-static int hpfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hpfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
-                           mnt);
 }
 static struct file_system_type hpfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "hpfs",
-        .get_sb         = hpfs_get_sb,
+        .mount          = hpfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7b027720d820..f702b5f713fc 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -598,6 +598,7 @@ static const struct file_operations hppfs_dir_fops = {
        .readdir        = hppfs_readdir,
        .open           = hppfs_dir_open,
        .fsync          = hppfs_fsync,
+        .llseek         = default_llseek,
 };
 static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -747,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
        return(err);
 }
-static int hppfs_read_super(struct file_system_type *type,
+static struct dentry *hppfs_read_super(struct file_system_type *type,
                            int flags, const char *dev_name,
-                            void *data, struct vfsmount *mnt)
+                            void *data)
 {
-        return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
+        return mount_nodev(type, flags, data, hppfs_fill_super);
 }
 static struct file_system_type hppfs_type = {
        .owner          = THIS_MODULE,
        .name           = "hppfs",
-        .get_sb         = hppfs_read_super,
+        .mount          = hppfs_read_super,
        .kill_sb        = kill_anon_super,
        .fs_flags       = 0,
 };
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..a5fe68189eed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 #include <asm/uaccess.h>
@@ -455,6 +456,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = uid;
                inode->i_gid = gid;
@@ -573,6 +575,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
        return 0;
 }
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+                                struct page *newpage, struct page *page)
+{
+        int rc;
+        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+        if (rc)
+                return rc;
+        migrate_page_copy(newpage, page);
+        return 0;
+}
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +674,7 @@ static const struct address_space_operations hugetlbfs_aops = {
        .write_begin    = hugetlbfs_write_begin,
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
+        .migratepage    = hugetlbfs_migrate_page,
 };
@@ -674,6 +690,7 @@ const struct file_operations hugetlbfs_file_operations = {
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
+        .llseek         = default_llseek,
 };
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -879,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta)
        }
 }
-static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
 }
 static struct file_system_type hugetlbfs_fs_type = {
        .name           = "hugetlbfs",
-        .get_sb         = hugetlbfs_get_sb,
+        .mount          = hugetlbfs_mount,
        .kill_sb        = kill_litter_super,
 };
@@ -915,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
-                        WARN_ONCE(1,
+                        printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
-                          "Using mlock ulimits for SHM_HUGETLB deprecated\n");
                } else {
                        *user = NULL;
                        return ERR_PTR(-EPERM);
diff --git a/fs/inode.c b/fs/inode.c
index 86464332e590..ae2727ab0c3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,11 +24,11 @@
 #include <linux/mount.h>
 #include <linux/async.h>
 #include <linux/posix_acl.h>
+#include <linux/ima.h>
 /*
 * This is needed for the following functions:
 *  - inode_has_buffers
- *  - invalidate_inode_buffers
 *  - invalidate_bdev
 *
 * FIXME: remove all knowledge of the buffer layer from this file
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
 * allowing for low-overhead inode sync() operations.
 */
-LIST_HEAD(inode_in_use);
+static LIST_HEAD(inode_lru);
-LIST_HEAD(inode_unused);
 static struct hlist_head *inode_hashtable __read_mostly;
 /*
@@ -103,8 +102,41 @@ static DECLARE_RWSEM(iprune_sem);
 */
 struct inodes_stat_t inodes_stat;
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 static struct kmem_cache *inode_cachep __read_mostly;
+static inline int get_nr_inodes(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes);
+}
+static inline int get_nr_inodes_unused(void)
+{
+        return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+int get_nr_dirty_inodes(void)
+{
+        int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+        return nr_dirty > 0 ? nr_dirty : 0;
+}
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        inodes_stat.nr_inodes = get_nr_inodes();
+        inodes_stat.nr_unused = get_nr_inodes_unused();
+        return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
 static void wake_up_inode(struct inode *inode)
 {
        /*
@@ -192,6 +224,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
+        percpu_counter_inc(&nr_inodes);
        return 0;
 out:
        return -ENOMEM;
@@ -232,11 +266,13 @@ void __destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
+        percpu_counter_dec(&nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
 {
+        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
@@ -255,6 +291,8 @@ void inode_init_once(struct inode *inode)
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_dentry);
        INIT_LIST_HEAD(&inode->i_devices);
+        INIT_LIST_HEAD(&inode->i_wb_list);
+        INIT_LIST_HEAD(&inode->i_lru);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        spin_lock_init(&inode->i_data.tree_lock);
        spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -281,14 +319,109 @@ static void init_once(void *foo)
 */
 void __iget(struct inode *inode)
 {
-        if (atomic_inc_return(&inode->i_count) != 1)
+        atomic_inc(&inode->i_count);
-                return;
+}
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+static void inode_lru_list_add(struct inode *inode)
+{
+        if (list_empty(&inode->i_lru)) {
+                list_add(&inode->i_lru, &inode_lru);
+                percpu_counter_inc(&nr_inodes_unused);
+        }
+}
-        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+static void inode_lru_list_del(struct inode *inode)
-                list_move(&inode->i_list, &inode_in_use);
+{
-        inodes_stat.nr_unused--;
+        if (!list_empty(&inode->i_lru)) {
+                list_del_init(&inode->i_lru);
+                percpu_counter_dec(&nr_inodes_unused);
+        }
+}
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
 }
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        __inode_sb_list_add(inode);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+        list_del_init(&inode->i_sb_list);
+}
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+        unsigned long tmp;
+        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                        L1_CACHE_BYTES;
+        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+        return tmp & I_HASHMASK;
+}
+/**
+ *      __insert_inode_hash - hash an inode
+ *      @inode: unhashed inode
+ *      @hashval: unsigned long value used to locate this object in the
+ *              inode_hashtable.
+ *
+ *      Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+        spin_lock(&inode_lock);
+        hlist_add_head(&inode->i_hash, b);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+/**
+ *      __remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the inode hash.  The caller must hold inode_lock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+        hlist_del_init(&inode->i_hash);
+}
+/**
+ *      remove_inode_hash - remove an inode from the hash
+ *      @inode: inode to unhash
+ *
+ *      Remove an inode from the inode hash.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+        spin_lock(&inode_lock);
+        hlist_del_init(&inode->i_hash);
+        spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
 void end_writeback(struct inode *inode)
 {
        might_sleep();
@@ -327,101 +460,113 @@ static void evict(struct inode *inode)
 */
 static void dispose_list(struct list_head *head)
 {
-        int nr_disposed = 0;
        while (!list_empty(head)) {
                struct inode *inode;
-                inode = list_first_entry(head, struct inode, i_list);
+                inode = list_first_entry(head, struct inode, i_lru);
-                list_del(&inode->i_list);
+                list_del_init(&inode->i_lru);
                evict(inode);
                spin_lock(&inode_lock);
-                hlist_del_init(&inode->i_hash);
+                __remove_inode_hash(inode);
-                list_del_init(&inode->i_sb_list);
+                __inode_sb_list_del(inode);
                spin_unlock(&inode_lock);
                wake_up_inode(inode);
                destroy_inode(inode);
-                nr_disposed++;
        }
-        spin_lock(&inode_lock);
-        inodes_stat.nr_inodes -= nr_disposed;
-        spin_unlock(&inode_lock);
 }
-/*
+/**
- * Invalidate all inodes for a device.
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb:         superblock to operate on
+ *
+ * Make sure that no inodes with zero refcount are retained.  This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
 */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+void evict_inodes(struct super_block *sb)
 {
-        struct list_head *next;
+        struct inode *inode, *next;
-        int busy = 0, count = 0;
+        LIST_HEAD(dispose);
-        next = head->next;
-        for (;;) {
-                struct list_head *tmp = next;
-                struct inode *inode;
-                /*
+        down_write(&iprune_sem);
-                 * We can reschedule here without worrying about the list's
-                 * consistency because the per-sb list of inodes must not
-                 * change during umount anymore, and because iprune_sem keeps
-                 * shrink_icache_memory() away.
-                 */
-                cond_resched_lock(&inode_lock);
-                next = next->next;
+        spin_lock(&inode_lock);
-                if (tmp == head)
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-                        break;
+                if (atomic_read(&inode->i_count))
-                inode = list_entry(tmp, struct inode, i_sb_list);
-                if (inode->i_state & I_NEW)
                        continue;
-                invalidate_inode_buffers(inode);
-                if (!atomic_read(&inode->i_count)) {
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-                        list_move(&inode->i_list, dispose);
+                        WARN_ON(1);
-                        WARN_ON(inode->i_state & I_NEW);
-                        inode->i_state |= I_FREEING;
-                        count++;
                        continue;
                }
-                busy = 1;
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
        }
-        /* only unused inodes may be cached with i_count zero */
+        spin_unlock(&inode_lock);
-        inodes_stat.nr_unused -= count;
-        return busy;
+        dispose_list(&dispose);
+        up_write(&iprune_sem);
 }
 /**
- *      invalidate_inodes       - discard the inodes on a device
+ * invalidate_inodes    - attempt to free all inodes on a superblock
- *      @sb: superblock
+ * @sb:         superblock to operate on
 *
- *      Discard all of the inodes for a given superblock. If the discard
+ * Attempts to free all inodes for a given superblock.  If there were any
- *      fails because there are busy inodes then a non zero value is returned.
+ * busy inodes return a non-zero value, else zero.
- *      If the discard is successful all the inodes have been discarded.
 */
 int invalidate_inodes(struct super_block *sb)
 {
-        int busy;
+        int busy = 0;
-        LIST_HEAD(throw_away);
+        struct inode *inode, *next;
+        LIST_HEAD(dispose);
        down_write(&iprune_sem);
        spin_lock(&inode_lock);
-        fsnotify_unmount_inodes(&sb->s_inodes);
+        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-        busy = invalidate_list(&sb->s_inodes, &throw_away);
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+                        continue;
+                if (atomic_read(&inode->i_count)) {
+                        busy = 1;
+                        continue;
+                }
+                inode->i_state |= I_FREEING;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &dispose);
+                list_del_init(&inode->i_wb_list);
+                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+                        percpu_counter_dec(&nr_inodes_unused);
+        }
        spin_unlock(&inode_lock);
-        dispose_list(&throw_away);
+        dispose_list(&dispose);
        up_write(&iprune_sem);
        return busy;
 }
-EXPORT_SYMBOL(invalidate_inodes);
 static int can_unuse(struct inode *inode)
 {
-        if (inode->i_state)
+        if (inode->i_state & ~I_REFERENCED)
                return 0;
        if (inode_has_buffers(inode))
                return 0;
@@ -433,22 +578,24 @@ static int can_unuse(struct inode *inode)
 }
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed.  We expect the final iput() on that inode to add it to
+ * pagecache removed.  If the inode has metadata buffers attached to
- * the front of the inode_unused list.  So look for it there and if the
+ * mapping->private_list then try to remove them.
- * inode is still freeable, proceed.  The right inode is found 99.9% of the
- * time in testing on a 4-way.
 *
- * If the inode has metadata buffers attached to mapping->private_list then
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
- * try to remove them.
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
 */
 static void prune_icache(int nr_to_scan)
 {
        LIST_HEAD(freeable);
-        int nr_pruned = 0;
        int nr_scanned;
        unsigned long reap = 0;
@@ -457,13 +604,26 @@ static void prune_icache(int nr_to_scan)
        for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                struct inode *inode;
-                if (list_empty(&inode_unused))
+                if (list_empty(&inode_lru))
                        break;
-                inode = list_entry(inode_unused.prev, struct inode, i_list);
+                inode = list_entry(inode_lru.prev, struct inode, i_lru);
-                if (inode->i_state || atomic_read(&inode->i_count)) {
+                /*
-                        list_move(&inode->i_list, &inode_unused);
+                 * Referenced or dirty inodes are still in use. Give them
+                 * another pass through the LRU as we cannot reclaim them now.
+                 */
+                if (atomic_read(&inode->i_count) ||
+                    (inode->i_state & ~I_REFERENCED)) {
+                        list_del_init(&inode->i_lru);
+                        percpu_counter_dec(&nr_inodes_unused);
+                        continue;
+                }
+                /* recently referenced inodes get one more pass */
+                if (inode->i_state & I_REFERENCED) {
+                        list_move(&inode->i_lru, &inode_lru);
+                        inode->i_state &= ~I_REFERENCED;
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -475,18 +635,23 @@ static void prune_icache(int nr_to_scan)
                        iput(inode);
                        spin_lock(&inode_lock);
-                        if (inode != list_entry(inode_unused.next,
+                        if (inode != list_entry(inode_lru.next,
-                                                struct inode, i_list))
+                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        if (!can_unuse(inode))
                                continue;
                }
-                list_move(&inode->i_list, &freeable);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_FREEING;
-                nr_pruned++;
+                /*
+                 * Move the inode off the IO lists and LRU once I_FREEING is
+                 * set so that it won't get moved back on there if it is dirty.
+                 */
+                list_move(&inode->i_lru, &freeable);
+                list_del_init(&inode->i_wb_list);
+                percpu_counter_dec(&nr_inodes_unused);
        }
-        inodes_stat.nr_unused -= nr_pruned;
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
@@ -518,7 +683,7 @@ static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
                        return -1;
                prune_icache(nr);
        }
-        return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+        return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker icache_shrinker = {
@@ -529,9 +694,6 @@ static struct shrinker icache_shrinker = {
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
 * Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
 */
 static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
@@ -551,9 +713,10 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
 }
 /*
@@ -576,53 +739,49 @@ repeat:
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
-                break;
+                __iget(inode);
+                return inode;
        }
-        return node ? inode : NULL;
+        return NULL;
-}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-        unsigned long tmp;
-        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                        L1_CACHE_BYTES;
-        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-        return tmp & I_HASHMASK;
-}
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
-                        struct inode *inode)
-{
-        inodes_stat.nr_inodes++;
-        list_add(&inode->i_list, &inode_in_use);
-        list_add(&inode->i_sb_list, &sb->s_inodes);
-        if (head)
-                hlist_add_head(&inode->i_hash, head);
 }
-/**
+/*
- * inode_add_to_lists - add a new inode to relevant lists
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
- * @sb: superblock inode belongs to
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
- * @inode: inode to mark in use
+ * to renew the exhausted range.
 *
- * When an inode is allocated it needs to be accounted for, added to the in use
+ * This does not significantly increase overflow rate because every CPU can
- * list, the owning superblock and the inode hash. This needs to be done under
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
- * the inode_lock, so export a function to do this rather than the inode lock
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
- * itself. We calculate the hash list to add to here so it is all internal
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
- * which requires the caller to have already set up the inode number in the
+ * overflow rate by 2x, which does not seem too significant.
- * inode to add.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
 */
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
+unsigned int get_next_ino(void)
 {
-        struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+        unsigned int *p = &get_cpu_var(last_ino);
+        unsigned int res = *p;
-        spin_lock(&inode_lock);
+#ifdef CONFIG_SMP
-        __inode_add_to_lists(sb, head, inode);
+        if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
-        spin_unlock(&inode_lock);
+                static atomic_t shared_last_ino;
+                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+                res = next - LAST_INO_BATCH;
+        }
+#endif
+        *p = ++res;
+        put_cpu_var(last_ino);
+        return res;
 }
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
+EXPORT_SYMBOL(get_next_ino);
 /**
 *      new_inode       - obtain an inode
@@ -638,12 +797,6 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
 */
 struct inode *new_inode(struct super_block *sb)
 {
-        /*
-         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
-         * error if st_ino won't fit in target struct field. Use 32bit counter
-         * here to attempt to avoid that.
-         */
-        static unsigned int last_ino;
        struct inode *inode;
        spin_lock_prefetch(&inode_lock);
@@ -651,8 +804,7 @@ struct inode *new_inode(struct super_block *sb)
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
-                __inode_add_to_lists(sb, NULL, inode);
+                __inode_sb_list_add(inode);
-                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
        }
@@ -663,7 +815,7 @@ EXPORT_SYMBOL(new_inode);
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-        if (inode->i_mode & S_IFDIR) {
+        if (S_ISDIR(inode->i_mode)) {
                struct file_system_type *type = inode->i_sb->s_type;
                /* Set new key only if filesystem hasn't already changed it */
@@ -720,7 +872,8 @@ static struct inode *get_new_inode(struct super_block *sb,
                        if (set(inode, data))
                                goto set_failed;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -735,7 +888,6 @@ static struct inode *get_new_inode(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -767,7 +919,8 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
-                        __inode_add_to_lists(sb, head, inode);
+                        hlist_add_head(&inode->i_hash, head);
+                        __inode_sb_list_add(inode);
                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
@@ -782,7 +935,6 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
-                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
@@ -791,6 +943,27 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
        return inode;
 }
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+        struct hlist_head *b = inode_hashtable + hash(sb, ino);
+        struct hlist_node *node;
+        struct inode *inode;
+        hlist_for_each_entry(inode, node, b, i_hash) {
+                if (inode->i_ino == ino && inode->i_sb == sb)
+                        return 0;
+        }
+        return 1;
+}
 /**
 *      iunique - get a unique inode number
 *      @sb: superblock
@@ -812,19 +985,18 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
+        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
-        struct inode *inode;
-        struct hlist_head *head;
        ino_t res;
        spin_lock(&inode_lock);
+        spin_lock(&iunique_lock);
        do {
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                res = counter++;
-                head = inode_hashtable + hash(sb, res);
+        } while (!test_inode_iunique(sb, res));
-                inode = find_inode_fast(sb, head, res);
+        spin_unlock(&iunique_lock);
-        } while (inode != NULL);
        spin_unlock(&inode_lock);
        return res;
@@ -876,7 +1048,6 @@ static struct inode *ifind(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode(sb, head, test, data);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                if (likely(wait))
                        wait_on_inode(inode);
@@ -909,7 +1080,6 @@ static struct inode *ifind_fast(struct super_block *sb,
        spin_lock(&inode_lock);
        inode = find_inode_fast(sb, head, ino);
        if (inode) {
-                __iget(inode);
                spin_unlock(&inode_lock);
                wait_on_inode(inode);
                return inode;
@@ -1095,7 +1265,7 @@ int insert_inode_locked(struct inode *inode)
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1134,7 +1304,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                __iget(old);
                spin_unlock(&inode_lock);
                wait_on_inode(old);
-                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
@@ -1143,36 +1313,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 }
 EXPORT_SYMBOL(insert_inode_locked4);
-/**
- *      __insert_inode_hash - hash an inode
- *      @inode: unhashed inode
- *      @hashval: unsigned long value used to locate this object in the
- *              inode_hashtable.
- *
- *      Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
-        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
-        spin_lock(&inode_lock);
-        hlist_add_head(&inode->i_hash, head);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
-/**
- *      remove_inode_hash - remove an inode from the hash
- *      @inode: inode to unhash
- *
- *      Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
-{
-        spin_lock(&inode_lock);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(remove_inode_hash);
 int generic_delete_inode(struct inode *inode)
 {
@@ -1187,7 +1327,7 @@ EXPORT_SYMBOL(generic_delete_inode);
 */
 int generic_drop_inode(struct inode *inode)
 {
-        return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+        return !inode->i_nlink || inode_unhashed(inode);
 }
 EXPORT_SYMBOL_GPL(generic_drop_inode);
@@ -1213,10 +1353,11 @@ static void iput_final(struct inode *inode)
                drop = generic_drop_inode(inode);
        if (!drop) {
-                if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                        list_move(&inode->i_list, &inode_unused);
-                inodes_stat.nr_unused++;
                if (sb->s_flags & MS_ACTIVE) {
+                        inode->i_state |= I_REFERENCED;
+                        if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+                                inode_lru_list_add(inode);
+                        }
                        spin_unlock(&inode_lock);
                        return;
                }
@@ -1227,19 +1368,23 @@ static void iput_final(struct inode *inode)
                spin_lock(&inode_lock);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state &= ~I_WILL_FREE;
-                inodes_stat.nr_unused--;
+                __remove_inode_hash(inode);
-                hlist_del_init(&inode->i_hash);
        }
-        list_del_init(&inode->i_list);
-        list_del_init(&inode->i_sb_list);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
-        inodes_stat.nr_inodes--;
+        /*
+         * Move the inode off the IO lists and LRU once I_FREEING is
+         * set so that it won't get moved back on there if it is dirty.
+         */
+        inode_lru_list_del(inode);
+        list_del_init(&inode->i_wb_list);
+        __inode_sb_list_del(inode);
        spin_unlock(&inode_lock);
        evict(inode);
-        spin_lock(&inode_lock);
+        remove_inode_hash(inode);
-        hlist_del_init(&inode->i_hash);
-        spin_unlock(&inode_lock);
        wake_up_inode(inode);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        destroy_inode(inode);
@@ -1503,6 +1648,8 @@ void __init inode_init(void)
                                         SLAB_MEM_SPREAD),
                                         init_once);
        register_shrinker(&icache_shrinker);
+        percpu_counter_init(&nr_inodes, 0);
+        percpu_counter_init(&nr_inodes_unused, 0);
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e91cee8..e43b9a4dbf4e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+/*
+ * inode.c
+ */
+extern int get_nr_dirty_inodes(void);
+extern void evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f855ea4fc888..e92fdbb3bc3a 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
+static int ioctl_fstrim(struct file *filp, void __user *argp)
+{
+        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+        struct fstrim_range range;
+        int ret = 0;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* If filesystem doesn't support trim feature, return. */
+        if (sb->s_op->trim_fs == NULL)
+                return -EOPNOTSUPP;
+        /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+        if (sb->s_bdev == NULL)
+                return -EINVAL;
+        if (argp == NULL) {
+                range.start = 0;
+                range.len = ULLONG_MAX;
+                range.minlen = 0;
+        } else if (copy_from_user(&range, argp, sizeof(range)))
+                return -EFAULT;
+        ret = sb->s_op->trim_fs(sb, &range);
+        if (ret < 0)
+                return ret;
+        if ((argp != NULL) &&
+            (copy_to_user(argp, &range, sizeof(range))))
+                return -EFAULT;
+        return 0;
+}
 /*
 * When you add any new common ioctls to the switches above and below
 * please update compat_sys_ioctl() too.
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
+        case FITRIM:
+                error = ioctl_fstrim(filp, argp);
+                break;
        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, arg);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..2f7d05c89922 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
        read_lock(&tasklist_lock);
        switch (which) {
                case IOPRIO_WHO_PROCESS:
+                        rcu_read_lock();
                        if (!who)
                                p = current;
                        else
                                p = find_task_by_vpid(who);
                        if (p)
                                ret = set_task_ioprio(p, ioprio);
+                        rcu_read_unlock();
                        break;
                case IOPRIO_WHO_PGRP:
                        if (!who)
@@ -139,7 +141,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
                                break;
                        do_each_thread(g, p) {
-                                if (__task_cred(p)->uid != who)
+                                int match;
+                                rcu_read_lock();
+                                match = __task_cred(p)->uid == who;
+                                rcu_read_unlock();
+                                if (!match)
                                        continue;
                                ret = set_task_ioprio(p, ioprio);
                                if (ret)
@@ -200,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
        read_lock(&tasklist_lock);
        switch (which) {
                case IOPRIO_WHO_PROCESS:
+                        rcu_read_lock();
                        if (!who)
                                p = current;
                        else
                                p = find_task_by_vpid(who);
                        if (p)
                                ret = get_task_ioprio(p);
+                        rcu_read_unlock();
                        break;
                case IOPRIO_WHO_PGRP:
                        if (!who)
@@ -232,7 +241,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
                                break;
                        do_each_thread(g, p) {
-                                if (__task_cred(p)->uid != user->uid)
+                                int match;
+                                rcu_read_lock();
+                                match = __task_cred(p)->uid == user->uid;
+                                rcu_read_unlock();
+                                if (!match)
                                        continue;
                                tmpio = get_task_ioprio(p);
                                if (tmpio < 0)
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index e0aca9a0ac68..0542b6eedf80 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
 *
 *  isofs directory handling functions
 */
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
@@ -255,18 +254,19 @@ static int isofs_readdir(struct file *filp,
        char *tmpname;
        struct iso_directory_record *tmpde;
        struct inode *inode = filp->f_path.dentry->d_inode;
+        struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
        tmpname = (char *)__get_free_page(GFP_KERNEL);
        if (tmpname == NULL)
                return -ENOMEM;
-        lock_kernel();
+        mutex_lock(&sbi->s_mutex);
        tmpde = (struct iso_directory_record *) (tmpname+1024);
        result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
        free_page((unsigned long) tmpname);
-        unlock_kernel();
+        mutex_unlock(&sbi->s_mutex);
        return result;
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5a44811b5027..bfdeb82a53be 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/nls.h>
 #include <linux/ctype.h>
-#include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include <linux/cdrom.h>
 #include <linux/parser.h>
@@ -44,11 +43,7 @@ static void isofs_put_super(struct super_block *sb)
        struct isofs_sb_info *sbi = ISOFS_SB(sb);
 #ifdef CONFIG_JOLIET
-        lock_kernel();
        unload_nls(sbi->s_nls_iocharset);
-        unlock_kernel();
 #endif
        kfree(sbi);
@@ -549,6 +544,34 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
 }
 /*
+ * Check if root directory is empty (has less than 3 files).
+ *
+ * Used to detect broken CDs where ISO root directory is empty but Joliet root
+ * directory is OK. If such CD has Rock Ridge extensions, they will be disabled
+ * (and Joliet used instead) or else no files would be visible.
+ */
+static bool rootdir_empty(struct super_block *sb, unsigned long block)
+{
+        int offset = 0, files = 0, de_len;
+        struct iso_directory_record *de;
+        struct buffer_head *bh;
+        bh = sb_bread(sb, block);
+        if (!bh)
+                return true;
+        while (files < 3) {
+                de = (struct iso_directory_record *) (bh->b_data + offset);
+                de_len = *(unsigned char *) de;
+                if (de_len == 0)
+                        break;
+                files++;
+                offset += de_len;
+        }
+        brelse(bh);
+        return files < 3;
+}
+/*
 * Initialize the superblock and read the root inode.
 *
 * Note: a check_disk_change() has been done immediately prior
@@ -823,6 +846,7 @@ root_found:
        sbi->s_utf8 = opt.utf8;
        sbi->s_nocompress = opt.nocompress;
        sbi->s_overriderockperm = opt.overriderockperm;
+        mutex_init(&sbi->s_mutex);
        /*
         * It would be incredibly stupid to allow people to mark every file
         * on the disk as suid, so we merely allow them to set the default
@@ -847,6 +871,18 @@ root_found:
                goto out_no_root;
        /*
+         * Fix for broken CDs with Rock Ridge and empty ISO root directory but
+         * correct Joliet root directory.
+         */
+        if (sbi->s_rock == 1 && joliet_level &&
+                                rootdir_empty(s, sbi->s_firstdatazone)) {
+                printk(KERN_NOTICE
+                        "ISOFS: primary root directory is empty. "
+                        "Disabling Rock Ridge and switching to Joliet.");
+                sbi->s_rock = 0;
+        }
+        /*
         * If this disk has both Rock Ridge and Joliet on it, then we
         * want to use Rock Ridge by default.  This can be overridden
         * by using the norock mount option.  There is still one other
@@ -966,27 +1002,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 * or getblk() if they are not.  Returns the number of blocks inserted
 * (-ve == error.)
 */
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
                     struct buffer_head **bh, unsigned long nblocks)
 {
-        unsigned long b_off;
+        unsigned long b_off = iblock;
        unsigned offset, sect_size;
        unsigned int firstext;
        unsigned long nextblk, nextoff;
-        long iblock = (long)iblock_s;
        int section, rv, error;
        struct iso_inode_info *ei = ISOFS_I(inode);
-        lock_kernel();
        error = -EIO;
        rv = 0;
-        if (iblock < 0 || iblock != iblock_s) {
+        if (iblock != b_off) {
                printk(KERN_DEBUG "%s: block number too large\n", __func__);
                goto abort;
        }
-        b_off = iblock;
        offset = 0;
        firstext = ei->i_first_extent;
@@ -1004,8 +1036,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                 * I/O errors.
                 */
                if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
-                        printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
+                        printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
-                                __func__, iblock, (unsigned long) inode->i_size);
+                                __func__, b_off,
+                                (unsigned long long)inode->i_size);
                        goto abort;
                }
@@ -1031,9 +1064,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
                        if (++section > 100) {
                                printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
                                        " aborting...\n", __func__);
-                                printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+                                printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
                                        "nextblk=%lu nextoff=%lu\n", __func__,
-                                        iblock, firstext, (unsigned) sect_size,
+                                        b_off, firstext, (unsigned) sect_size,
                                        nextblk, nextoff);
                                goto abort;
                        }
@@ -1054,7 +1087,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
        error = 0;
 abort:
-        unlock_kernel();
        return rv != 0 ? rv : error;
 }
@@ -1475,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb,
        return inode;
 }
-static int isofs_get_sb(struct file_system_type *fs_type,
+static struct dentry *isofs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
-                                mnt);
 }
 static struct file_system_type iso9660_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "iso9660",
-        .get_sb         = isofs_get_sb,
+        .mount          = isofs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 7d33de84f52a..2882dc089f87 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -55,6 +55,7 @@ struct isofs_sb_info {
        gid_t s_gid;
        uid_t s_uid;
        struct nls_table *s_nls_iocharset; /* Native language support table */
+        struct mutex s_mutex; /* replaces BKL, please remove if possible */
 };
 #define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index ab438beb867c..0d23abfd4280 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
 *  (C) 1991  Linus Torvalds - minix filesystem
 */
-#include <linux/smp_lock.h>
 #include <linux/gfp.h>
 #include "isofs.h"
@@ -168,6 +167,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
        int found;
        unsigned long uninitialized_var(block);
        unsigned long uninitialized_var(offset);
+        struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
        struct inode *inode;
        struct page *page;
@@ -177,7 +177,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
        if (!page)
                return ERR_PTR(-ENOMEM);
-        lock_kernel();
+        mutex_lock(&sbi->s_mutex);
        found = isofs_find_entry(dir, dentry,
                                &block, &offset,
                                page_address(page),
@@ -188,10 +188,10 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
        if (found) {
                inode = isofs_iget(dir->i_sb, block, offset);
                if (IS_ERR(inode)) {
-                        unlock_kernel();
+                        mutex_unlock(&sbi->s_mutex);
                        return ERR_CAST(inode);
                }
        }
-        unlock_kernel();
+        mutex_unlock(&sbi->s_mutex);
        return d_splice_alias(inode, dentry);
 }
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 96a685c550fd..f9cd04db6eab 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -8,7 +8,6 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "isofs.h"
 #include "rock.h"
@@ -661,6 +660,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 {
        struct inode *inode = page->mapping->host;
        struct iso_inode_info *ei = ISOFS_I(inode);
+        struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
        char *link = kmap(page);
        unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
        struct buffer_head *bh;
@@ -673,12 +673,12 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
        struct rock_state rs;
        int ret;
-        if (!ISOFS_SB(inode->i_sb)->s_rock)
+        if (!sbi->s_rock)
                goto error;
        init_rock_state(&rs, inode);
        block = ei->i_iget5_block;
-        lock_kernel();
+        mutex_lock(&sbi->s_mutex);
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                goto out_noread;
@@ -748,7 +748,7 @@ repeat:
                goto fail;
        brelse(bh);
        *rpnt = '\0';
-        unlock_kernel();
+        mutex_unlock(&sbi->s_mutex);
        SetPageUptodate(page);
        kunmap(page);
        unlock_page(page);
@@ -765,7 +765,7 @@ out_bad_span:
        printk("symlink spans iso9660 blocks\n");
 fail:
        brelse(bh);
-        unlock_kernel();
+        mutex_unlock(&sbi->s_mutex);
 error:
        SetPageError(page);
        kunmap(page);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 05a38b9c4c0e..e4b87bc1fa56 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -221,7 +221,7 @@ restart:
                        goto restart;
                }
                if (buffer_locked(bh)) {
-                        atomic_inc(&bh->b_count);
+                        get_bh(bh);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
@@ -283,7 +283,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
        int ret = 0;
        if (buffer_locked(bh)) {
-                atomic_inc(&bh->b_count);
+                get_bh(bh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                wait_on_buffer(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..34a4861c14b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
-        if (journal->j_flags & JFS_BARRIER) {
+        if (journal->j_flags & JFS_BARRIER)
-                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
+                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+        else
-                /*
-                 * Is it possible for another commit to fail at roughly
-                 * the same time as this one?  If so, we don't want to
-                 * trust the barrier flag in the super, but instead want
-                 * to remember if we sent a barrier request
-                 */
-                if (ret == -EOPNOTSUPP) {
-                        char b[BDEVNAME_SIZE];
-                        printk(KERN_WARNING
-                                "JBD: barrier-based sync failed on %s - "
-                                "disabling barriers\n",
-                                bdevname(journal->j_dev, b));
-                        spin_lock(&journal->j_state_lock);
-                        journal->j_flags &= ~JFS_BARRIER;
-                        spin_unlock(&journal->j_state_lock);
-                        /* And try again, without the barrier */
-                        set_buffer_uptodate(bh);
-                        set_buffer_dirty(bh);
-                        ret = sync_dirty_buffer(bh);
-                }
-        } else {
                ret = sync_dirty_buffer(bh);
-        }
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);
@@ -318,7 +294,7 @@ void journal_commit_transaction(journal_t *journal)
        int first_tag = 0;
        int tag_flag;
        int i;
-        int write_op = WRITE;
+        int write_op = WRITE_SYNC;
        /*
         * First job: lock down the current transaction and wait for
@@ -611,13 +587,13 @@ void journal_commit_transaction(journal_t *journal)
                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
-                atomic_inc(&jh2bh(jh)->b_count);
+                get_bh(jh2bh(jh));
                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/
-                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+                set_buffer_jwrite(jh2bh(jh));
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
@@ -627,7 +603,7 @@ void journal_commit_transaction(journal_t *journal)
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
-                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+                set_buffer_jwrite(jh2bh(new_jh));
                wbuf[bufs++] = jh2bh(new_jh);
                /* Record the new block's tag in the current descriptor
@@ -737,7 +713,7 @@ wait_for_iobuf:
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
-                clear_bit(BH_JWrite, &bh->b_state);
+                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));
                /* The metadata is now released for reuse, but we need
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2c4b1f109da9..da1b5e4ffce1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/ratelimit.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -84,6 +85,7 @@ EXPORT_SYMBOL(journal_force_commit);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static const char *journal_dev_name(journal_t *journal, char *buffer);
 /*
 * Helper function used to manage commit timeouts
@@ -439,7 +441,7 @@ int __log_start_commit(journal_t *journal, tid_t target)
         */
        if (!tid_geq(journal->j_commit_request, target)) {
                /*
-                 * We want a new commit: OK, mark the request and wakup the
+                 * We want a new commit: OK, mark the request and wakeup the
                 * commit thread.  We do _not_ do the commit ourselves.
                 */
@@ -950,6 +952,8 @@ int journal_create(journal_t *journal)
                if (err)
                        return err;
                bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+                if (unlikely(!bh))
+                        return -ENOMEM;
                lock_buffer(bh);
                memset (bh->b_data, 0, journal->j_blocksize);
                BUFFER_TRACE(bh, "marking dirty");
@@ -1010,6 +1014,23 @@ void journal_update_superblock(journal_t *journal, int wait)
                goto out;
        }
+        if (buffer_write_io_error(bh)) {
+                char b[BDEVNAME_SIZE];
+                /*
+                 * Oh, dear.  A previous attempt to write the journal
+                 * superblock failed.  This could happen because the
+                 * USB device was yanked out.  Or it could happen to
+                 * be a transient write error and maybe the block will
+                 * be remapped.  Nothing we can do but to retry the
+                 * write and hope for the best.
+                 */
+                printk(KERN_ERR "JBD: previous I/O error detected "
+                       "for journal superblock update for %s.\n",
+                       journal_dev_name(journal, b));
+                clear_buffer_write_io_error(bh);
+                set_buffer_uptodate(bh);
+        }
        spin_lock(&journal->j_state_lock);
        jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
                  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1021,9 +1042,17 @@ void journal_update_superblock(journal_t *journal, int wait)
        BUFFER_TRACE(bh, "marking dirty");
        mark_buffer_dirty(bh);
-        if (wait)
+        if (wait) {
                sync_dirty_buffer(bh);
-        else
+                if (buffer_write_io_error(bh)) {
+                        char b[BDEVNAME_SIZE];
+                        printk(KERN_ERR "JBD: I/O error detected "
+                               "when updating journal superblock for %s.\n",
+                               journal_dev_name(journal, b));
+                        clear_buffer_write_io_error(bh);
+                        set_buffer_uptodate(bh);
+                }
+        } else
                write_dirty_buffer(bh, WRITE);
 out:
@@ -1719,7 +1748,6 @@ static void journal_destroy_journal_head_cache(void)
 static struct journal_head *journal_alloc_journal_head(void)
 {
        struct journal_head *ret;
-        static unsigned long last_warning;
 #ifdef CONFIG_JBD_DEBUG
        atomic_inc(&nr_journal_heads);
@@ -1727,11 +1755,9 @@ static struct journal_head *journal_alloc_journal_head(void)
        ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
        if (ret == NULL) {
                jbd_debug(1, "out of memory for journal_head\n");
-                if (time_after(jiffies, last_warning + 5*HZ)) {
+                printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
-                        printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+                                   __func__);
-                               __func__);
-                        last_warning = jiffies;
-                }
                while (ret == NULL) {
                        yield();
                        ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 81051dafebf5..5b43e96788e6 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -296,10 +296,10 @@ int journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD_DEBUG
                int dropped = info.end_transaction -
                              be32_to_cpu(journal->j_superblock->s_sequence);
-#endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
                          dropped, (dropped == 1) ? "" : "s");
+#endif
                journal->j_transaction_sequence = ++info.end_transaction;
        }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5ae71e75a491..846a3f314111 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -293,9 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
                jbd_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
-                goto out;
        }
-out:
        return handle;
 }
@@ -528,7 +526,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        transaction = handle->h_transaction;
        journal = transaction->t_journal;
-        jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+        jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
        JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -713,7 +711,7 @@ done:
                J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
                            "Possible IO failure.\n");
                page = jh2bh(jh)->b_page;
-                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+                offset = offset_in_page(jh2bh(jh)->b_data);
                source = kmap_atomic(page, KM_USER0);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6a79fd0a1a32 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                transaction->t_chp_stats.cs_forced_to_close++;
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
+                if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+                        /*
+                         * The journal thread is dead; so starting and
+                         * waiting for a commit to finish will cause
+                         * us to wait for a _very_ long time.
+                         */
+                        printk(KERN_ERR "JBD2: %s: "
+                               "Waiting for Godot: block %llu\n",
+                               journal->j_devname,
+                               (unsigned long long) bh->b_blocknr);
                jbd2_log_start_commit(journal, tid);
                jbd2_log_wait_commit(journal, tid);
                ret = 1;
@@ -532,8 +542,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
         */
        if ((journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
-                        BLKDEV_IFL_WAIT);
        if (!(journal->j_flags & JBD2_ABORT))
                jbd2_journal_update_superblock(journal, 1);
        return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..f3ad1598b201 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -26,7 +26,9 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/bitops.h>
 #include <trace/events/jbd2.h>
+#include <asm/system.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -134,25 +136,11 @@ static int journal_submit_commit_record(journal_t *journal,
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
-                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-                ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
+                ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
-                if (ret == -EOPNOTSUPP) {
+        else
-                        printk(KERN_WARNING
-                               "JBD2: Disabling barriers on %s, "
-                               "not supported by device\n", journal->j_devname);
-                        write_lock(&journal->j_state_lock);
-                        journal->j_flags &= ~JBD2_BARRIER;
-                        write_unlock(&journal->j_state_lock);
-                        /* And try again, without the barrier */
-                        lock_buffer(bh);
-                        set_buffer_uptodate(bh);
-                        clear_buffer_dirty(bh);
-                        ret = submit_bh(WRITE_SYNC_PLUG, bh);
-                }
-        } else {
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
-        }
        *cbh = bh;
        return ret;
 }
@@ -166,29 +154,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
 {
        int ret = 0;
-retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
-        if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-                printk(KERN_WARNING
-                       "JBD2: %s: disabling barries on %s - not supported "
-                       "by device\n", __func__, journal->j_devname);
-                write_lock(&journal->j_state_lock);
-                journal->j_flags &= ~JBD2_BARRIER;
-                write_unlock(&journal->j_state_lock);
-                lock_buffer(bh);
-                clear_buffer_dirty(bh);
-                set_buffer_uptodate(bh);
-                bh->b_end_io = journal_end_buffer_io_sync;
-                ret = submit_bh(WRITE_SYNC_PLUG, bh);
-                if (ret) {
-                        unlock_buffer(bh);
-                        return ret;
-                }
-                goto retry;
-        }
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@ -236,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
-                jinode->i_flags |= JI_COMMIT_RUNNING;
+                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                /*
                 * submit the inode data buffers. We use writepage
@@ -251,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                commit_transaction->t_flushed_data_blocks = 1;
-                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
@@ -272,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-                jinode->i_flags |= JI_COMMIT_RUNNING;
+                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
@@ -288,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
-                jinode->i_flags &= ~JI_COMMIT_RUNNING;
+                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
@@ -360,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
-        int write_op = WRITE;
+        int write_op = WRITE_SYNC;
        /*
         * First job: lock down the current transaction and wait for
@@ -701,6 +670,16 @@ start_journal_io:
                }
        }
+        err = journal_finish_inode_data_buffers(journal, commit_transaction);
+        if (err) {
+                printk(KERN_WARNING
+                        "JBD2: Detected IO errors while flushing file data "
+                       "on %s\n", journal->j_devname);
+                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+                        jbd2_journal_abort(journal, err);
+                err = 0;
+        }
        /* 
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
@@ -709,8 +688,7 @@ start_journal_io:
        if (commit_transaction->t_flushed_data_blocks &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
-                        BLKDEV_IFL_WAIT);
        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +697,6 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-                if (journal->j_flags & JBD2_BARRIER)
-                        blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-                                BLKDEV_IFL_WAIT);
-        }
-        err = journal_finish_inode_data_buffers(journal, commit_transaction);
-        if (err) {
-                printk(KERN_WARNING
-                        "JBD2: Detected IO errors while flushing file data "
-                       "on %s\n", journal->j_devname);
-                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-                        jbd2_journal_abort(journal, err);
-                err = 0;
        }
        /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +810,11 @@ wait_for_iobuf:
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(journal, cbh);
+        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+            journal->j_flags & JBD2_BARRIER) {
+                blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+        }
        if (err)
                jbd2_journal_abort(journal, err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 262419f83d80..c590d155c095 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -42,12 +42,14 @@
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
+#include <linux/bitops.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/system.h>
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
@@ -478,7 +480,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
         */
        if (!tid_geq(journal->j_commit_request, target)) {
                /*
-                 * We want a new commit: OK, mark the request and wakup the
+                 * We want a new commit: OK, mark the request and wakeup the
                 * commit thread.  We do _not_ do the commit ourselves.
                 */
@@ -1836,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal)
 */
 #define JBD2_MAX_SLABS 8
 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
-static DECLARE_MUTEX(jbd2_slab_create_sem);
 static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
        "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
@@ -1857,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void)
 static int jbd2_journal_create_slab(size_t size)
 {
+        static DEFINE_MUTEX(jbd2_slab_create_mutex);
        int i = order_base_2(size) - 10;
        size_t slab_size;
@@ -1868,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size)
        if (unlikely(i < 0))
                i = 0;
-        down(&jbd2_slab_create_sem);
+        mutex_lock(&jbd2_slab_create_mutex);
        if (jbd2_slab[i]) {
-                up(&jbd2_slab_create_sem);
+                mutex_unlock(&jbd2_slab_create_mutex);
                return 0;       /* Already created */
        }
        slab_size = 1 << (i+10);
        jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
                                         slab_size, 0, NULL);
-        up(&jbd2_slab_create_sem);
+        mutex_unlock(&jbd2_slab_create_mutex);
        if (!jbd2_slab[i]) {
                printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
                return -ENOMEM;
@@ -2210,7 +2212,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
 restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
-        if (jinode->i_flags & JI_COMMIT_RUNNING) {
+        if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3479d6e0a83..6bf0a242613e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -156,6 +156,7 @@ alloc_transaction:
         */
 repeat:
        read_lock(&journal->j_state_lock);
+        BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a906f538d11c..85c6be2db02f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
 static inline struct jffs2_inode_cache *
 first_inode_chain(int *i, struct jffs2_sb_info *c)
 {
-        for (; *i < INOCACHE_HASHSIZE; (*i)++) {
+        for (; *i < c->inocache_hashsize; (*i)++) {
                if (c->inocache_list[*i])
                        return c->inocache_list[*i];
        }
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 617a1e5694c1..de4247021d25 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                        spin_unlock(&jffs2_compressor_list_lock);
                        *datalen  = orig_slen;
                        *cdatalen = orig_dlen;
-                        compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL);
+                        compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
                        spin_lock(&jffs2_compressor_list_lock);
                        this->usecount--;
                        if (!compr_ret) {
@@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                        spin_unlock(&jffs2_compressor_list_lock);
                        *datalen  = orig_slen;
                        *cdatalen = orig_dlen;
-                        compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL);
+                        compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen);
                        spin_lock(&jffs2_compressor_list_lock);
                        this->usecount--;
                        if (!compr_ret) {
@@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                        if (comprtype == this->compr) {
                                this->usecount++;
                                spin_unlock(&jffs2_compressor_list_lock);
-                                ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL);
+                                ret = this->decompress(cdata_in, data_out, cdatalen, datalen);
                                spin_lock(&jffs2_compressor_list_lock);
                                if (ret) {
                                        printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret);
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index e471a9106fd9..13bb7597ab39 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -49,9 +49,9 @@ struct jffs2_compressor {
        char *name;
        char compr;                     /* JFFS2_COMPR_XXX */
        int (*compress)(unsigned char *data_in, unsigned char *cpage_out,
-                        uint32_t *srclen, uint32_t *destlen, void *model);
+                        uint32_t *srclen, uint32_t *destlen);
        int (*decompress)(unsigned char *cdata_in, unsigned char *data_out,
-                          uint32_t cdatalen, uint32_t datalen, void *model);
+                          uint32_t cdatalen, uint32_t datalen);
        int usecount;
        int disabled;           /* if set the compressor won't compress */
        unsigned char *compr_buf;       /* used by size compr. mode */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index ed25ae7c98eb..af186ee674d8 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -42,7 +42,7 @@ static int __init alloc_workspace(void)
 }
 static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
-                              uint32_t *sourcelen, uint32_t *dstlen, void *model)
+                              uint32_t *sourcelen, uint32_t *dstlen)
 {
        size_t compress_size;
        int ret;
@@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 }
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-                                 uint32_t srclen, uint32_t destlen, void *model)
+                                 uint32_t srclen, uint32_t destlen)
 {
        size_t dl = destlen;
        int ret;
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 9696ad9ef5f7..16a5047903a6 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -31,8 +31,7 @@
 /* _compress returns the compressed size, -1 if bigger */
 static int jffs2_rtime_compress(unsigned char *data_in,
                                unsigned char *cpage_out,
-                                uint32_t *sourcelen, uint32_t *dstlen,
+                                uint32_t *sourcelen, uint32_t *dstlen)
-                                void *model)
 {
        short positions[256];
        int outpos = 0;
@@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
 static int jffs2_rtime_decompress(unsigned char *data_in,
                                  unsigned char *cpage_out,
-                                  uint32_t srclen, uint32_t destlen,
+                                  uint32_t srclen, uint32_t destlen)
-                                  void *model)
 {
        short positions[256];
        int outpos = 0;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index a12b4f763373..9e7cec808c4c 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
 #if 0
 /* _compress returns the compressed size, -1 if bigger */
 int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
-                   uint32_t *sourcelen, uint32_t *dstlen, void *model)
+                   uint32_t *sourcelen, uint32_t *dstlen)
 {
        return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
                                 cpage_out, sourcelen, dstlen);
@@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
 #endif
 static int jffs2_dynrubin_compress(unsigned char *data_in,
                                   unsigned char *cpage_out,
-                                   uint32_t *sourcelen, uint32_t *dstlen,
+                                   uint32_t *sourcelen, uint32_t *dstlen)
-                                   void *model)
 {
        int bits[8];
        unsigned char histo[256];
@@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits,
 static int jffs2_rubinmips_decompress(unsigned char *data_in,
                                      unsigned char *cpage_out,
-                                      uint32_t sourcelen, uint32_t dstlen,
+                                      uint32_t sourcelen, uint32_t dstlen)
-                                      void *model)
 {
        rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
                            cpage_out, sourcelen, dstlen);
@@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
 static int jffs2_dynrubin_decompress(unsigned char *data_in,
                                     unsigned char *cpage_out,
-                                     uint32_t sourcelen, uint32_t dstlen,
+                                     uint32_t sourcelen, uint32_t dstlen)
-                                     void *model)
 {
        int bits[8];
        int c;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 97fc45de6f81..fd05a0b9431d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -68,8 +68,7 @@ static void free_workspaces(void)
 static int jffs2_zlib_compress(unsigned char *data_in,
                               unsigned char *cpage_out,
-                               uint32_t *sourcelen, uint32_t *dstlen,
+                               uint32_t *sourcelen, uint32_t *dstlen)
-                               void *model)
 {
        int ret;
@@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in,
 static int jffs2_zlib_decompress(unsigned char *data_in,
                                 unsigned char *cpage_out,
-                                 uint32_t srclen, uint32_t destlen,
+                                 uint32_t srclen, uint32_t destlen)
-                                 void *model)
 {
        int ret;
        int wbits = MAX_WBITS;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3cb0..92978658ed18 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
                mutex_unlock(&f->sem);
                d_instantiate(dentry, old_dentry->d_inode);
                dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
        }
        return ret;
 }
@@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
        }
        /* We use f->target field to store the target path. */
-        f->target = kmalloc(targetlen + 1, GFP_KERNEL);
+        f->target = kmemdup(target, targetlen + 1, GFP_KERNEL);
        if (!f->target) {
                printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
                mutex_unlock(&f->sem);
@@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
                goto fail;
        }
-        memcpy(f->target, target, targetlen + 1);
        D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target));
        /* No data here. Only a metadata node, which will be
@@ -864,7 +863,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
                printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
                /* Might as well let the VFS know */
                d_instantiate(new_dentry, old_dentry->d_inode);
-                atomic_inc(&old_dentry->d_inode->i_count);
+                ihold(old_dentry->d_inode);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index abac961f617b..e513f1913c15 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
                }
                /* Be nice */
-                yield();
+                cond_resched();
                mutex_lock(&c->erase_free_sem);
                spin_lock(&c->erase_completion_lock);
        }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 6b2964a19850..e896e67767eb 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -21,7 +21,6 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include "nodelist.h"
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -391,7 +390,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
           This also catches the case where it was stopped and this
           is just a remount to restart it.
           Flush the writebuffer, if neccecary, else we loose it */
-        lock_kernel();
        if (!(sb->s_flags & MS_RDONLY)) {
                jffs2_stop_garbage_collect_thread(c);
                mutex_lock(&c->alloc_sem);
@@ -403,8 +401,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
                jffs2_start_garbage_collect_thread(c);
        *flags |= MS_NOATIME;
-        unlock_kernel();
        return 0;
 }
@@ -478,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
        return inode;
 }
+static int calculate_inocache_hashsize(uint32_t flash_size)
+{
+        /*
+         * Pick a inocache hash size based on the size of the medium.
+         * Count how many megabytes we're dealing with, apply a hashsize twice
+         * that size, but rounding down to the usual big powers of 2. And keep
+         * to sensible bounds.
+         */
+        int size_mb = flash_size / 1024 / 1024;
+        int hashsize = (size_mb * 2) & ~0x3f;
+        if (hashsize < INOCACHE_HASHSIZE_MIN)
+                return INOCACHE_HASHSIZE_MIN;
+        if (hashsize > INOCACHE_HASHSIZE_MAX)
+                return INOCACHE_HASHSIZE_MAX;
+        return hashsize;
+}
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 {
@@ -524,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
        if (ret)
                return ret;
-        c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
+        c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size);
+        c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL);
        if (!c->inocache_list) {
                ret = -ENOMEM;
                goto out_wbuf;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 846a79452497..31dce611337c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
        if (!list_empty(&c->erase_complete_list) ||
            !list_empty(&c->erase_pending_list)) {
                spin_unlock(&c->erase_completion_lock);
+                mutex_unlock(&c->alloc_sem);
                D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
-                if (jffs2_erase_pending_blocks(c, 1)) {
+                if (jffs2_erase_pending_blocks(c, 1))
-                        mutex_unlock(&c->alloc_sem);
                        return 0;
-                }
                D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
                spin_lock(&c->erase_completion_lock);
+                mutex_lock(&c->alloc_sem);
        }
        /* First, work out which block we're garbage-collecting */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 6784bc89add1..f864005de64c 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
        wait_queue_head_t erase_wait;           /* For waiting for erases to complete */
        wait_queue_head_t inocache_wq;
+        int inocache_hashsize;
        struct jffs2_inode_cache **inocache_list;
        spinlock_t inocache_lock;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index af02bd138469..5e03233c2363 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t
 {
        struct jffs2_inode_cache *ret;
-        ret = c->inocache_list[ino % INOCACHE_HASHSIZE];
+        ret = c->inocache_list[ino % c->inocache_hashsize];
        while (ret && ret->ino < ino) {
                ret = ret->next;
        }
@@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new
        dbg_inocache("add %p (ino #%u)\n", new, new->ino);
-        prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE];
+        prev = &c->inocache_list[new->ino % c->inocache_hashsize];
        while ((*prev) && (*prev)->ino < new->ino) {
                prev = &(*prev)->next;
@@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
        dbg_inocache("del %p (ino #%u)\n", old, old->ino);
        spin_lock(&c->inocache_lock);
-        prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE];
+        prev = &c->inocache_list[old->ino % c->inocache_hashsize];
        while ((*prev) && (*prev)->ino < old->ino) {
                prev = &(*prev)->next;
@@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
        int i;
        struct jffs2_inode_cache *this, *next;
-        for (i=0; i<INOCACHE_HASHSIZE; i++) {
+        for (i=0; i < c->inocache_hashsize; i++) {
                this = c->inocache_list[i];
                while (this) {
                        next = this->next;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 523a91691052..5a53d9bdb2b5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -199,7 +199,8 @@ struct jffs2_inode_cache {
 #define RAWNODE_CLASS_XATTR_DATUM       1
 #define RAWNODE_CLASS_XATTR_REF         2
-#define INOCACHE_HASHSIZE 128
+#define INOCACHE_HASHSIZE_MIN 128
+#define INOCACHE_HASHSIZE_MAX 1024
 #define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 46f870d1cc36..b632dddcb482 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -20,7 +20,7 @@
 #include "summary.h"
 #include "debug.h"
-#define DEFAULT_EMPTY_SCAN_SIZE 1024
+#define DEFAULT_EMPTY_SCAN_SIZE 256
 #define noisy_printk(noise, args...) do { \
        if (*(noise)) { \
@@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
                                  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
        struct jffs2_unknown_node *node;
        struct jffs2_unknown_node crcnode;
-        uint32_t ofs, prevofs;
+        uint32_t ofs, prevofs, max_ofs;
        uint32_t hdr_crc, buf_ofs, buf_len;
        int err;
        int noise = 0;
@@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
        /* We temporarily use 'ofs' as a pointer into the buffer/jeb */
        ofs = 0;
+        max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
-        /* Scan only 4KiB of 0xFF before declaring it's empty */
+        /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */
-        while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
+        while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF)
                ofs += 4;
-        if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) {
+        if (ofs == max_ofs) {
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
                if (jffs2_cleanmarker_oob(c)) {
                        /* scan oob, take care of cleanmarker */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 662bba099501..c86041b866a4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -146,6 +145,7 @@ static const struct super_operations jffs2_super_operations =
 static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct jffs2_sb_info *c;
+        int ret;
        D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():"
                  " New superblock for device %d (\"%s\")\n",
@@ -175,15 +175,15 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
        sb->s_flags |= MS_POSIXACL;
 #endif
-        return jffs2_do_fill_super(sb, data, silent);
+        ret = jffs2_do_fill_super(sb, data, silent);
+        return ret;
 }
-static int jffs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *jffs2_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super,
+        return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super);
-                          mnt);
 }
 static void jffs2_put_super (struct super_block *sb)
@@ -192,8 +192,6 @@ static void jffs2_put_super (struct super_block *sb)
        D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
-        lock_kernel();
        if (sb->s_dirt)
                jffs2_write_super(sb);
@@ -215,8 +213,6 @@ static void jffs2_put_super (struct super_block *sb)
        if (c->mtd->sync)
                c->mtd->sync(c->mtd);
-        unlock_kernel();
        D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
@@ -232,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb)
 static struct file_system_type jffs2_fs_type = {
        .owner =        THIS_MODULE,
        .name =         "jffs2",
-        .get_sb =       jffs2_get_sb,
+        .mount =        jffs2_mount,
        .kill_sb =      jffs2_kill_sb,
 };
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc8eeb2..3a09423b6c22 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
         * appear hashed, but do not put on any lists.  hlist_del()
         * will work fine and require no locking.
         */
-        ip->i_hash.pprev = &ip->i_hash.next;
+        hlist_add_fake(&ip->i_hash);
        return (ip);
 }
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index c51af2a14516..e1b8493b9aaa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1010,15 +1010,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
                 * option 2 - shutdown file systems
                 *            associated with log ?
                 * option 3 - extend log ?
-                 */
-                /*
                 * option 4 - second chance
                 *
                 * mark log wrapped, and continue.
                 * when all active transactions are completed,
-                 * mark log vaild for recovery.
+                 * mark log valid for recovery.
                 * if crashed during invalid state, log state
-                 * implies invald log, forcing fsck().
+                 * implies invalid log, forcing fsck().
                 */
                /* mark log state log wrap in log superblock */
                /* log->state = LOGWRAP; */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 7b698f2ec45a..9895595fd2f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -97,7 +97,7 @@ int jfs_mount(struct super_block *sb)
        ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
        if (ipaimap == NULL) {
-                jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+                jfs_err("jfs_mount: Failed to read AGGREGATE_I");
                rc = -EIO;
                goto errout20;
        }
@@ -148,7 +148,7 @@ int jfs_mount(struct super_block *sb)
        if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
                ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
                if (!ipaimap2) {
-                        jfs_err("jfs_mount: Faild to read AGGREGATE_I");
+                        jfs_err("jfs_mount: Failed to read AGGREGATE_I");
                        rc = -EIO;
                        goto errout35;
                }
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea76b445..9466957ec841 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
         * lazy commit thread finishes processing
         */
        if (tblk->xflag & COMMIT_DELETE) {
-                atomic_inc(&tblk->u.ip->i_count);
+                ihold(tblk->u.ip);
                /*
                 * Avoid a rare deadlock
                 *
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8675be..231ca4af9bce 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@ static int jfs_link(struct dentry *old_dentry,
        ip->i_ctime = CURRENT_TIME;
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        mark_inode_dirty(dir);
-        atomic_inc(&ip->i_count);
+        ihold(ip);
        iplist[0] = ip;
        iplist[1] = dir;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index ec8c3e4baca3..0669fc1cc3bf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -33,7 +33,6 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -176,8 +175,6 @@ static void jfs_put_super(struct super_block *sb)
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-        lock_kernel();
        rc = jfs_umount(sb);
        if (rc)
                jfs_err("jfs_umount failed with return code %d", rc);
@@ -188,8 +185,6 @@ static void jfs_put_super(struct super_block *sb)
        iput(sbi->direct_inode);
        kfree(sbi);
-        unlock_kernel();
 }
 enum {
@@ -369,19 +364,16 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
        if (!parse_options(data, sb, &newLVSize, &flag)) {
                return -EINVAL;
        }
-        lock_kernel();
        if (newLVSize) {
                if (sb->s_flags & MS_RDONLY) {
                        printk(KERN_ERR
                  "JFS: resize requires volume to be mounted read-write\n");
-                        unlock_kernel();
                        return -EROFS;
                }
                rc = jfs_extendfs(sb, newLVSize, 0);
-                if (rc) {
+                if (rc)
-                        unlock_kernel();
                        return rc;
-                }
        }
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -397,36 +389,30 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
                /* mark the fs r/w for quota activity */
                sb->s_flags &= ~MS_RDONLY;
-                unlock_kernel();
                dquot_resume(sb, -1);
                return ret;
        }
        if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
                rc = dquot_suspend(sb, -1);
                if (rc < 0) {
-                        unlock_kernel();
                        return rc;
                }
                rc = jfs_umount_rw(sb);
                JFS_SBI(sb)->flag = flag;
-                unlock_kernel();
                return rc;
        }
        if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
                if (!(sb->s_flags & MS_RDONLY)) {
                        rc = jfs_umount_rw(sb);
-                        if (rc) {
+                        if (rc)
-                                unlock_kernel();
                                return rc;
-                        }
                        JFS_SBI(sb)->flag = flag;
                        ret = jfs_mount_rw(sb, 1);
-                        unlock_kernel();
                        return ret;
                }
        JFS_SBI(sb)->flag = flag;
-        unlock_kernel();
        return 0;
 }
@@ -446,6 +432,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        sb->s_fs_info = sbi;
        sbi->sb = sb;
        sbi->uid = sbi->gid = sbi->umask = -1;
@@ -596,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb)
        return 0;
 }
-static int jfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
-                           mnt);
 }
 static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -783,7 +769,7 @@ static const struct export_operations jfs_export_operations = {
 static struct file_system_type jfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "jfs",
-        .get_sb         = jfs_get_sb,
+        .mount          = jfs_do_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa0387d6e..a3accdf528ad 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = {
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
-int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
-        const struct super_operations *ops, unsigned long magic,
+        const struct super_operations *ops, unsigned long magic)
-        struct vfsmount *mnt)
 {
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        struct dentry *dentry;
@@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        struct qstr d_name = {.name = name, .len = strlen(name)};
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        s->s_flags = MS_NOUSER;
        s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        d_instantiate(dentry, root);
        s->s_root = dentry;
        s->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 Enomem:
        deactivate_locked_super(s);
-        return -ENOMEM;
+        return ERR_PTR(-ENOMEM);
 }
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -255,7 +253,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dget(dentry);
        d_instantiate(dentry, inode);
        return 0;
@@ -892,10 +890,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 */
 int generic_file_fsync(struct file *file, int datasync)
 {
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 0, /* metadata-only; caller takes care of data */
-        };
        struct inode *inode = file->f_mapping->host;
        int err;
        int ret;
@@ -906,7 +900,7 @@ int generic_file_fsync(struct file *file, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return ret;
-        err = sync_inode(inode, &wbc);
+        err = sync_inode_metadata(inode, 1);
        if (ret == 0)
                ret = err;
        return ret;
@@ -955,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
-EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(mount_pseudo);
 EXPORT_SYMBOL(simple_write_begin);
 EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_dir_inode_operations);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 64fd427c993c..d5bb86866e6c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,6 +42,7 @@ struct nlm_wait {
 };
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 /**
 * nlmclnt_init - Set up per-NFS mount point lockd data structures
@@ -97,7 +98,10 @@ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *
                block->b_lock = fl;
                init_waitqueue_head(&block->b_wait);
                block->b_status = nlm_lck_blocked;
+                spin_lock(&nlm_blocked_lock);
                list_add(&block->b_list, &nlm_blocked);
+                spin_unlock(&nlm_blocked_lock);
        }
        return block;
 }
@@ -106,7 +110,9 @@ void nlmclnt_finish_block(struct nlm_wait *block)
 {
        if (block == NULL)
                return;
+        spin_lock(&nlm_blocked_lock);
        list_del(&block->b_list);
+        spin_unlock(&nlm_blocked_lock);
        kfree(block);
 }
@@ -154,6 +160,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
         * Look up blocked request based on arguments. 
         * Warning: must not use cookie to match it!
         */
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                struct file_lock *fl_blocked = block->b_lock;
@@ -178,6 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
                wake_up(&block->b_wait);
                res = nlm_granted;
        }
+        spin_unlock(&nlm_blocked_lock);
        return res;
 }
@@ -216,10 +224,6 @@ reclaimer(void *ptr)
        allow_signal(SIGKILL);
        down_write(&host->h_rwsem);
-        /* This one ensures that our parent doesn't terminate while the
-         * reclaim is in progress */
-        lock_kernel();
        lockd_up();     /* note: this cannot fail as lockd is already running */
        dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
@@ -260,16 +264,17 @@ restart:
        dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
        /* Now, wake up all processes that sleep on a blocked lock */
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (block->b_host == host) {
                        block->b_status = nlm_lck_denied_grace_period;
                        wake_up(&block->b_wait);
                }
        }
+        spin_unlock(&nlm_blocked_lock);
        /* Release host handle after use */
        nlm_release_host(host);
        lockd_down();
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7932c399fab4..47ea1e1925b8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -166,7 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
        /* Set up the argument struct */
        nlmclnt_setlockargs(call, fl);
-        lock_kernel();
        if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
                if (fl->fl_type != F_UNLCK) {
                        call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -177,10 +176,8 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
                status = nlmclnt_test(call, fl);
        else
                status = -EINVAL;
        fl->fl_ops->fl_release_private(fl);
        fl->fl_ops = NULL;
-        unlock_kernel();
        dprintk("lockd: clnt proc returns %d\n", status);
        return status;
@@ -226,9 +223,7 @@ void nlm_release_call(struct nlm_rqst *call)
 static void nlmclnt_rpc_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -448,14 +443,18 @@ out:
 static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+        spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        new->fl_u.nfs_fl.state = fl->fl_u.nfs_fl.state;
        new->fl_u.nfs_fl.owner = nlm_get_lockowner(fl->fl_u.nfs_fl.owner);
        list_add_tail(&new->fl_u.nfs_fl.list, &fl->fl_u.nfs_fl.owner->host->h_granted);
+        spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
 }
 static void nlmclnt_locks_release_private(struct file_lock *fl)
 {
+        spin_lock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        list_del(&fl->fl_u.nfs_fl.list);
+        spin_unlock(&fl->fl_u.nfs_fl.owner->host->h_lock);
        nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
@@ -721,9 +720,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 die:
        return;
 retry_rebind:
-        lock_kernel();
        nlm_rebind_host(req->a_host);
-        unlock_kernel();
 retry_unlock:
        rpc_restart_call(task);
 }
@@ -801,9 +798,7 @@ retry_cancel:
        /* Don't ever retry more than 3 times */
        if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
                goto die;
-        lock_kernel();
        nlm_rebind_host(req->a_host);
-        unlock_kernel();
        rpc_restart_call(task);
        rpc_delay(task, 30 * HZ);
 }
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index bb464d12104c..25e21e4023b2 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -353,6 +353,7 @@ nlm_bind_host(struct nlm_host *host)
                        .to_retries     = 5U,
                };
                struct rpc_create_args args = {
+                        .net            = &init_net,
                        .protocol       = host->h_proto,
                        .address        = nlm_addr(host),
                        .addrsize       = host->h_addrlen,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e3015464fbab..e0c918949644 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,6 +69,7 @@ static struct rpc_clnt *nsm_create(void)
                .sin_addr.s_addr        = htonl(INADDR_LOOPBACK),
        };
        struct rpc_create_args args = {
+                .net                    = &init_net,
                .protocol               = XPRT_TRANSPORT_UDP,
                .address                = (struct sockaddr *)&sin,
                .addrsize               = sizeof(sin),
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index f1bacf1a0391..abfff9d7979d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -22,7 +22,6 @@
 #include <linux/in.h>
 #include <linux/uio.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -130,15 +129,6 @@ lockd(void *vrqstp)
        dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
-        /*
-         * FIXME: it would be nice if lockd didn't spend its entire life
-         * running under the BKL. At the very least, it would be good to
-         * have someone clarify what it's intended to protect here. I've
-         * seen some handwavy posts about posix locking needing to be
-         * done under the BKL, but it's far from clear.
-         */
-        lock_kernel();
        if (!nlm_timeout)
                nlm_timeout = LOCKD_DFLT_TIMEO;
        nlmsvc_timeout = nlm_timeout * HZ;
@@ -195,7 +185,6 @@ lockd(void *vrqstp)
        if (nlmsvc_ops)
                nlmsvc_invalidate_all();
        nlm_shutdown_hosts();
-        unlock_kernel();
        return 0;
 }
@@ -206,7 +195,7 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
        xprt = svc_find_xprt(serv, name, family, 0);
        if (xprt == NULL)
-                return svc_create_xprt(serv, name, family, port,
+                return svc_create_xprt(serv, name, &init_net, family, port,
                                                SVC_SOCK_DEFAULTS);
        svc_xprt_put(xprt);
        return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 031c6569a134..a336e832475d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -230,9 +230,7 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 static void nlm4svc_callback_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84055d31bfc5..c462d346acbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -52,12 +52,13 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
 * The list of blocked locks to retry
 */
 static LIST_HEAD(nlm_blocked);
+static DEFINE_SPINLOCK(nlm_blocked_lock);
 /*
 * Insert a blocked lock into the global list
 */
 static void
-nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+nlmsvc_insert_block_locked(struct nlm_block *block, unsigned long when)
 {
        struct nlm_block *b;
        struct list_head *pos;
@@ -87,6 +88,13 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
        block->b_when = when;
 }
+static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
+{
+        spin_lock(&nlm_blocked_lock);
+        nlmsvc_insert_block_locked(block, when);
+        spin_unlock(&nlm_blocked_lock);
+}
 /*
 * Remove a block from the global list
 */
@@ -94,7 +102,9 @@ static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
        if (!list_empty(&block->b_list)) {
+                spin_lock(&nlm_blocked_lock);
                list_del_init(&block->b_list);
+                spin_unlock(&nlm_blocked_lock);
                nlmsvc_release_block(block);
        }
 }
@@ -651,7 +661,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
        struct nlm_block *block;
        int rc = -ENOENT;
-        lock_kernel();
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
                        dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
@@ -665,13 +675,13 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
                        } else if (result == 0)
                                block->b_granted = 1;
-                        nlmsvc_insert_block(block, 0);
+                        nlmsvc_insert_block_locked(block, 0);
                        svc_wake_up(block->b_daemon);
                        rc = 0;
                        break;
                }
        }
-        unlock_kernel();
+        spin_unlock(&nlm_blocked_lock);
        if (rc == -ENOENT)
                printk(KERN_WARNING "lockd: grant for unknown block\n");
        return rc;
@@ -690,14 +700,16 @@ nlmsvc_notify_blocked(struct file_lock *fl)
        struct nlm_block        *block;
        dprintk("lockd: VFS unblock notification for block %p\n", fl);
+        spin_lock(&nlm_blocked_lock);
        list_for_each_entry(block, &nlm_blocked, b_list) {
                if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
-                        nlmsvc_insert_block(block, 0);
+                        nlmsvc_insert_block_locked(block, 0);
+                        spin_unlock(&nlm_blocked_lock);
                        svc_wake_up(block->b_daemon);
                        return;
                }
        }
+        spin_unlock(&nlm_blocked_lock);
        printk(KERN_WARNING "lockd: notification for unknown block!\n");
 }
@@ -803,7 +815,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
        dprintk("lockd: GRANT_MSG RPC callback\n");
-        lock_kernel();
+        spin_lock(&nlm_blocked_lock);
        /* if the block is not on a list at this point then it has
         * been invalidated. Don't try to requeue it.
         *
@@ -825,19 +837,20 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
                /* Call was successful, now wait for client callback */
                timeout = 60 * HZ;
        }
-        nlmsvc_insert_block(block, timeout);
+        nlmsvc_insert_block_locked(block, timeout);
        svc_wake_up(block->b_daemon);
 out:
-        unlock_kernel();
+        spin_unlock(&nlm_blocked_lock);
 }
+/*
+ * FIXME: nlmsvc_release_block() grabs a mutex.  This is not allowed for an
+ * .rpc_release rpc_call_op
+ */
 static void nlmsvc_grant_release(void *data)
 {
        struct nlm_rqst         *call = data;
-        lock_kernel();
        nlmsvc_release_block(call->a_block);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 0f2ab741ae7c..c3069f38d602 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -260,9 +260,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 static void nlmsvc_callback_release(void *data)
 {
-        lock_kernel();
        nlm_release_call(data);
-        unlock_kernel();
 }
 static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d0ef94cfb3da..1ca0679c80bf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -170,6 +170,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 again:
        file->f_locks = 0;
+        lock_flocks(); /* protects i_flock list */
        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
                if (fl->fl_lmops != &nlmsvc_lock_operations)
                        continue;
@@ -181,6 +182,7 @@ again:
                if (match(lockhost, host)) {
                        struct file_lock lock = *fl;
+                        unlock_flocks();
                        lock.fl_type  = F_UNLCK;
                        lock.fl_start = 0;
                        lock.fl_end   = OFFSET_MAX;
@@ -192,6 +194,7 @@ again:
                        goto again;
                }
        }
+        unlock_flocks();
        return 0;
 }
@@ -226,10 +229,14 @@ nlm_file_inuse(struct nlm_file *file)
        if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
                return 1;
+        lock_flocks();
        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-                if (fl->fl_lmops == &nlmsvc_lock_operations)
+                if (fl->fl_lmops == &nlmsvc_lock_operations) {
+                        unlock_flocks();
                        return 1;
+                }
        }
+        unlock_flocks();
        file->f_locks = 0;
        return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index ab24d49fc048..0e62dd35d088 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -142,14 +142,32 @@ int lease_break_time = 45;
 static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
+static DEFINE_SPINLOCK(file_lock_lock);
+/*
+ * Protects the two list heads above, plus the inode->i_flock list
+ * FIXME: should use a spinlock, once lockd and ceph are ready.
+ */
+void lock_flocks(void)
+{
+        spin_lock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(lock_flocks);
+void unlock_flocks(void)
+{
+        spin_unlock(&file_lock_lock);
+}
+EXPORT_SYMBOL_GPL(unlock_flocks);
 static struct kmem_cache *filelock_cache __read_mostly;
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(void)
 {
        return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
 }
+EXPORT_SYMBOL_GPL(locks_alloc_lock);
 void locks_release_private(struct file_lock *fl)
 {
@@ -168,7 +186,7 @@ void locks_release_private(struct file_lock *fl)
 EXPORT_SYMBOL_GPL(locks_release_private);
 /* Free a lock which is not in use. */
-static void locks_free_lock(struct file_lock *fl)
+void locks_free_lock(struct file_lock *fl)
 {
        BUG_ON(waitqueue_active(&fl->fl_wait));
        BUG_ON(!list_empty(&fl->fl_block));
@@ -177,6 +195,7 @@ static void locks_free_lock(struct file_lock *fl)
        locks_release_private(fl);
        kmem_cache_free(filelock_cache, fl);
 }
+EXPORT_SYMBOL(locks_free_lock);
 void locks_init_lock(struct file_lock *fl)
 {
@@ -216,11 +235,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
                        fl->fl_ops->fl_copy_lock(new, fl);
                new->fl_ops = fl->fl_ops;
        }
-        if (fl->fl_lmops) {
+        if (fl->fl_lmops)
-                if (fl->fl_lmops->fl_copy_lock)
-                        fl->fl_lmops->fl_copy_lock(new, fl);
                new->fl_lmops = fl->fl_lmops;
-        }
 }
 /*
@@ -511,9 +527,9 @@ static void __locks_delete_block(struct file_lock *waiter)
 */
 static void locks_delete_block(struct file_lock *waiter)
 {
-        lock_kernel();
+        lock_flocks();
        __locks_delete_block(waiter);
-        unlock_kernel();
+        unlock_flocks();
 }
 /* Insert waiter into blocker's block list.
@@ -644,7 +660,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 {
        struct file_lock *cfl;
-        lock_kernel();
+        lock_flocks();
        for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
                if (!IS_POSIX(cfl))
                        continue;
@@ -657,7 +673,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
                        fl->fl_pid = pid_vnr(cfl->fl_nspid);
        } else
                fl->fl_type = F_UNLCK;
-        unlock_kernel();
+        unlock_flocks();
        return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -730,18 +746,16 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
        int error = 0;
        int found = 0;
-        lock_kernel();
+        if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
-        if (request->fl_flags & FL_ACCESS)
-                goto find_conflict;
-        if (request->fl_type != F_UNLCK) {
-                error = -ENOMEM;
                new_fl = locks_alloc_lock();
-                if (new_fl == NULL)
+                if (!new_fl)
-                        goto out;
+                        return -ENOMEM;
-                error = 0;
        }
+        lock_flocks();
+        if (request->fl_flags & FL_ACCESS)
+                goto find_conflict;
        for_each_lock(inode, before) {
                struct file_lock *fl = *before;
                if (IS_POSIX(fl))
@@ -767,8 +781,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
         * If a higher-priority process was blocked on the old file lock,
         * give it the opportunity to lock the file.
         */
-        if (found)
+        if (found) {
+                unlock_flocks();
                cond_resched();
+                lock_flocks();
+        }
 find_conflict:
        for_each_lock(inode, before) {
@@ -794,7 +811,7 @@ find_conflict:
        error = 0;
 out:
-        unlock_kernel();
+        unlock_flocks();
        if (new_fl)
                locks_free_lock(new_fl);
        return error;
@@ -823,7 +840,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                new_fl2 = locks_alloc_lock();
        }
-        lock_kernel();
+        lock_flocks();
        if (request->fl_type != F_UNLCK) {
                for_each_lock(inode, before) {
                        fl = *before;
@@ -991,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                locks_wake_up_blocks(left);
        }
 out:
-        unlock_kernel();
+        unlock_flocks();
        /*
         * Free any unused locks.
         */
@@ -1066,14 +1083,14 @@ int locks_mandatory_locked(struct inode *inode)
        /*
         * Search the lock list for this inode for any POSIX locks.
         */
-        lock_kernel();
+        lock_flocks();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!IS_POSIX(fl))
                        continue;
                if (fl->fl_owner != owner)
                        break;
        }
-        unlock_kernel();
+        unlock_flocks();
        return fl ? -EAGAIN : 0;
 }
@@ -1186,7 +1203,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
        new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
-        lock_kernel();
+        lock_flocks();
        time_out_leases(inode);
@@ -1247,8 +1264,10 @@ restart:
                        break_time++;
        }
        locks_insert_block(flock, new_fl);
+        unlock_flocks();
        error = wait_event_interruptible_timeout(new_fl->fl_wait,
                                                !new_fl->fl_next, break_time);
+        lock_flocks();
        __locks_delete_block(new_fl);
        if (error >= 0) {
                if (error == 0)
@@ -1263,7 +1282,7 @@ restart:
        }
 out:
-        unlock_kernel();
+        unlock_flocks();
        if (!IS_ERR(new_fl))
                locks_free_lock(new_fl);
        return error;
@@ -1319,7 +1338,7 @@ int fcntl_getlease(struct file *filp)
        struct file_lock *fl;
        int type = F_UNLCK;
-        lock_kernel();
+        lock_flocks();
        time_out_leases(filp->f_path.dentry->d_inode);
        for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
                        fl = fl->fl_next) {
@@ -1328,7 +1347,7 @@ int fcntl_getlease(struct file *filp)
                        break;
                }
        }
-        unlock_kernel();
+        unlock_flocks();
        return type;
 }
@@ -1341,36 +1360,32 @@ int fcntl_getlease(struct file *filp)
 *      The (input) flp->fl_lmops->fl_break function is required
 *      by break_lease().
 *
- *      Called with kernel lock held.
+ *      Called with file_lock_lock held.
 */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
        struct file_lock *fl, **before, **my_before = NULL, *lease;
-        struct file_lock *new_fl = NULL;
        struct dentry *dentry = filp->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        int error, rdlease_count = 0, wrlease_count = 0;
+        lease = *flp;
+        error = -EACCES;
        if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
-                return -EACCES;
+                goto out;
+        error = -EINVAL;
        if (!S_ISREG(inode->i_mode))
-                return -EINVAL;
+                goto out;
        error = security_file_lock(filp, arg);
        if (error)
-                return error;
+                goto out;
        time_out_leases(inode);
        BUG_ON(!(*flp)->fl_lmops->fl_break);
-        lease = *flp;
        if (arg != F_UNLCK) {
-                error = -ENOMEM;
-                new_fl = locks_alloc_lock();
-                if (new_fl == NULL)
-                        goto out;
                error = -EAGAIN;
                if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                        goto out;
@@ -1410,12 +1425,12 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
                goto out;
        if (my_before != NULL) {
-                *flp = *my_before;
                error = lease->fl_lmops->fl_change(my_before, arg);
+                if (!error)
+                        *flp = *my_before;
                goto out;
        }
-        error = 0;
        if (arg == F_UNLCK)
                goto out;
@@ -1423,20 +1438,23 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
        if (!leases_enable)
                goto out;
-        locks_copy_lock(new_fl, lease);
+        locks_insert_lock(before, lease);
-        locks_insert_lock(before, new_fl);
-        *flp = new_fl;
        return 0;
 out:
-        if (new_fl != NULL)
-                locks_free_lock(new_fl);
        return error;
 }
 EXPORT_SYMBOL(generic_setlease);
- /**
+static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+{
+        if (filp->f_op && filp->f_op->setlease)
+                return filp->f_op->setlease(filp, arg, lease);
+        else
+                return generic_setlease(filp, arg, lease);
+}
+/**
 *      vfs_setlease        -       sets a lease on an open file
 *      @filp: file pointer
 *      @arg: type of lease to obtain
@@ -1467,17 +1485,67 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
        int error;
-        lock_kernel();
+        lock_flocks();
-        if (filp->f_op && filp->f_op->setlease)
+        error = __vfs_setlease(filp, arg, lease);
-                error = filp->f_op->setlease(filp, arg, lease);
+        unlock_flocks();
-        else
-                error = generic_setlease(filp, arg, lease);
-        unlock_kernel();
        return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
+static int do_fcntl_delete_lease(struct file *filp)
+{
+        struct file_lock fl, *flp = &fl;
+        lease_init(filp, F_UNLCK, flp);
+        return vfs_setlease(filp, F_UNLCK, &flp);
+}
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+{
+        struct file_lock *fl, *ret;
+        struct fasync_struct *new;
+        int error;
+        fl = lease_alloc(filp, arg);
+        if (IS_ERR(fl))
+                return PTR_ERR(fl);
+        new = fasync_alloc();
+        if (!new) {
+                locks_free_lock(fl);
+                return -ENOMEM;
+        }
+        ret = fl;
+        lock_flocks();
+        error = __vfs_setlease(filp, arg, &ret);
+        if (error) {
+                unlock_flocks();
+                locks_free_lock(fl);
+                goto out_free_fasync;
+        }
+        if (ret != fl)
+                locks_free_lock(fl);
+        /*
+         * fasync_insert_entry() returns the old entry if any.
+         * If there was no old entry, then it used 'new' and
+         * inserted it into the fasync list. Clear new so that
+         * we don't release it here.
+         */
+        if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
+                new = NULL;
+        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+        unlock_flocks();
+out_free_fasync:
+        if (new)
+                fasync_free(new);
+        return error;
+}
 /**
 *      fcntl_setlease  -       sets a lease on an open file
 *      @fd: open file descriptor
@@ -1490,34 +1558,9 @@ EXPORT_SYMBOL_GPL(vfs_setlease);
 */
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
-        struct file_lock fl, *flp = &fl;
+        if (arg == F_UNLCK)
-        struct inode *inode = filp->f_path.dentry->d_inode;
+                return do_fcntl_delete_lease(filp);
-        int error;
+        return do_fcntl_add_lease(fd, filp, arg);
-        locks_init_lock(&fl);
-        error = lease_init(filp, arg, &fl);
-        if (error)
-                return error;
-        lock_kernel();
-        error = vfs_setlease(filp, arg, &flp);
-        if (error || arg == F_UNLCK)
-                goto out_unlock;
-        error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
-        if (error < 0) {
-                /* remove lease just inserted by setlease */
-                flp->fl_type = F_UNLCK | F_INPROGRESS;
-                flp->fl_break_time = jiffies - 10;
-                time_out_leases(inode);
-                goto out_unlock;
-        }
-        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-out_unlock:
-        unlock_kernel();
-        return error;
 }
 /**
@@ -2020,7 +2063,7 @@ void locks_remove_flock(struct file *filp)
                        fl.fl_ops->fl_release_private(&fl);
        }
-        lock_kernel();
+        lock_flocks();
        before = &inode->i_flock;
        while ((fl = *before) != NULL) {
@@ -2038,7 +2081,7 @@ void locks_remove_flock(struct file *filp)
                }
                before = &fl->fl_next;
        }
-        unlock_kernel();
+        unlock_flocks();
 }
 /**
@@ -2053,12 +2096,12 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 {
        int status = 0;
-        lock_kernel();
+        lock_flocks();
        if (waiter->fl_next)
                __locks_delete_block(waiter);
        else
                status = -ENOENT;
-        unlock_kernel();
+        unlock_flocks();
        return status;
 }
@@ -2085,7 +2128,7 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/seq_file.h>
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
-                                                        int id, char *pfx)
+                            loff_t id, char *pfx)
 {
        struct inode *inode = NULL;
        unsigned int fl_pid;
@@ -2098,7 +2141,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
        if (fl->fl_file != NULL)
                inode = fl->fl_file->f_path.dentry->d_inode;
-        seq_printf(f, "%d:%s ", id, pfx);
+        seq_printf(f, "%lld:%s ", id, pfx);
        if (IS_POSIX(fl)) {
                seq_printf(f, "%6s %s ",
                             (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
@@ -2161,30 +2204,33 @@ static int locks_show(struct seq_file *f, void *v)
        fl = list_entry(v, struct file_lock, fl_link);
-        lock_get_status(f, fl, (long)f->private, "");
+        lock_get_status(f, fl, *((loff_t *)f->private), "");
        list_for_each_entry(bfl, &fl->fl_block, fl_block)
-                lock_get_status(f, bfl, (long)f->private, " ->");
+                lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
-        f->private++;
        return 0;
 }
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-        lock_kernel();
+        loff_t *p = f->private;
-        f->private = (void *)1;
+        lock_flocks();
+        *p = (*pos + 1);
        return seq_list_start(&file_lock_list, *pos);
 }
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
+        loff_t *p = f->private;
+        ++*p;
        return seq_list_next(v, &file_lock_list, pos);
 }
 static void locks_stop(struct seq_file *f, void *v)
 {
-        unlock_kernel();
+        unlock_flocks();
 }
 static const struct seq_operations locks_seq_operations = {
@@ -2196,14 +2242,14 @@ static const struct seq_operations locks_seq_operations = {
 static int locks_open(struct inode *inode, struct file *filp)
 {
-        return seq_open(filp, &locks_seq_operations);
+        return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
 }
 static const struct file_operations proc_locks_operations = {
        .open           = locks_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-        .release        = seq_release,
+        .release        = seq_release_private,
 };
 static int __init proc_locks_init(void)
@@ -2231,7 +2277,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
        struct file_lock *fl;
        int result = 1;
-        lock_kernel();
+        lock_flocks();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (IS_POSIX(fl)) {
                        if (fl->fl_type == F_RDLCK)
@@ -2248,7 +2294,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
                result = 0;
                break;
        }
-        unlock_kernel();
+        unlock_flocks();
        return result;
 }
@@ -2271,7 +2317,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
        struct file_lock *fl;
        int result = 1;
-        lock_kernel();
+        lock_flocks();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (IS_POSIX(fl)) {
                        if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2286,7 +2332,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
                result = 0;
                break;
        }
-        unlock_kernel();
+        unlock_flocks();
        return result;
 }
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9bd2ce2a3040..92ca6fbe09bd 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
        return sync_request(page, bdev, WRITE);
 }
-static void bdev_put_device(struct super_block *sb)
+static void bdev_put_device(struct logfs_super *s)
 {
-        close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+        close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = {
        .put_device     = bdev_put_device,
 };
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
-                const char *devname, struct vfsmount *mnt)
+                const char *devname)
 {
        struct block_device *bdev;
@@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags,
        if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
                int mtdnr = MINOR(bdev->bd_dev);
                close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
-                return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+                return logfs_get_sb_mtd(p, mtdnr);
        }
-        return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+        p->s_bdev = bdev;
+        p->s_mtd = NULL;
+        p->s_devops = &bd_devops;
+        return 0;
 }
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index a85d47d13e4b..7466e9dcc8c5 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
        __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
 }
-static void mtd_put_device(struct super_block *sb)
+static void mtd_put_device(struct logfs_super *s)
 {
-        put_mtd_device(logfs_super(sb)->s_mtd);
+        put_mtd_device(s->s_mtd);
 }
 static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
@@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = {
        .put_device     = mtd_put_device,
 };
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-                int mtdnr, struct vfsmount *mnt)
 {
-        struct mtd_info *mtd;
+        struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
-        const struct logfs_device_ops *devops = &mtd_devops;
-        mtd = get_mtd_device(NULL, mtdnr);
        if (IS_ERR(mtd))
                return PTR_ERR(mtd);
-        return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+        s->s_bdev = NULL;
+        s->s_mtd = mtd;
+        s->s_devops = &mtd_devops;
+        return 0;
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9777eb5b5522..409dfd65e9a1 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,
                return -EMLINK;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_nlink++;
        mark_inode_dirty_sync(inode);
@@ -827,4 +827,5 @@ const struct file_operations logfs_dir_fops = {
        .unlocked_ioctl = logfs_ioctl,
        .readdir        = logfs_readdir,
        .read           = generic_read_dir,
+        .llseek         = default_llseek,
 };
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b8786264d243..57afd4a6fabb 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -136,6 +136,7 @@ struct logfs_area_ops {
        int     (*erase_segment)(struct logfs_area *area);
 };
+struct logfs_super;     /* forward */
 /**
 * struct logfs_device_ops - device access operations
 *
@@ -156,7 +157,7 @@ struct logfs_device_ops {
                        int ensure_write);
        int (*can_write_buf)(struct super_block *sb, u64 ofs);
        void (*sync)(struct super_block *sb);
-        void (*put_device)(struct super_block *sb);
+        void (*put_device)(struct logfs_super *s);
 };
 /**
@@ -471,11 +472,13 @@ void logfs_compr_exit(void);
 /* dev_bdev.c */
 #ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+int logfs_get_sb_bdev(struct logfs_super *s,
-                const char *devname, struct vfsmount *mnt);
+                struct file_system_type *type,
+                const char *devname);
 #else
-static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_bdev(struct logfs_super *s,
-                const char *devname, struct vfsmount *mnt)
+                struct file_system_type *type,
+                const char *devname)
 {
        return -ENODEV;
 }
@@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
 /* dev_mtd.c */
 #ifdef CONFIG_MTD
-int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
-                int mtdnr, struct vfsmount *mnt);
 #else
-static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-                int mtdnr, struct vfsmount *mnt)
 {
        return -ENODEV;
 }
@@ -619,9 +620,6 @@ void emergency_read_end(struct page *page);
 void logfs_crash_dump(struct super_block *sb);
 void *memchr_inv(const void *s, int c, size_t n);
 int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_get_sb_device(struct file_system_type *type, int flags,
-                struct mtd_info *mtd, struct block_device *bdev,
-                const struct logfs_device_ops *devops, struct vfsmount *mnt);
 int logfs_check_ds(struct logfs_disk_super *ds);
 int logfs_write_sb(struct super_block *sb);
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5336155c5d81..33435e4b14d2 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb)
        return 0;
 }
-static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+static int logfs_get_sb_final(struct super_block *sb)
 {
        struct logfs_super *super = logfs_super(sb);
        struct inode *rootdir;
@@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
        }
        log_super("LogFS: Finished mounting\n");
-        simple_set_mnt(mnt, sb);
        return 0;
 fail:
@@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb)
        logfs_cleanup_rw(sb);
        if (super->s_erase_page)
                __free_page(super->s_erase_page);
-        super->s_devops->put_device(sb);
+        super->s_devops->put_device(super);
        logfs_mempool_destroy(super->s_btree_pool);
        logfs_mempool_destroy(super->s_alias_pool);
        kfree(super);
        log_super("LogFS: Finished unmounting\n");
 }
-int logfs_get_sb_device(struct file_system_type *type, int flags,
+static struct dentry *logfs_get_sb_device(struct logfs_super *super,
-                struct mtd_info *mtd, struct block_device *bdev,
+                struct file_system_type *type, int flags)
-                const struct logfs_device_ops *devops, struct vfsmount *mnt)
 {
-        struct logfs_super *super;
        struct super_block *sb;
        int err = -ENOMEM;
        static int mount_count;
        log_super("LogFS: Start mount %x\n", mount_count++);
-        super = kzalloc(sizeof(*super), GFP_KERNEL);
-        if (!super)
-                goto err0;
-        super->s_mtd    = mtd;
-        super->s_bdev   = bdev;
        err = -EINVAL;
        sb = sget(type, logfs_sb_test, logfs_sb_set, super);
-        if (IS_ERR(sb))
+        if (IS_ERR(sb)) {
-                goto err0;
+                super->s_devops->put_device(super);
+                kfree(super);
+                return ERR_CAST(sb);
+        }
        if (sb->s_root) {
                /* Device is already in use */
-                err = 0;
+                super->s_devops->put_device(super);
-                simple_set_mnt(mnt, sb);
+                kfree(super);
-                goto err0;
+                return dget(sb->s_root);
        }
-        super->s_devops = devops;
        /*
         * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
         * only covers 16TB and the upper 8TB are used for indirect blocks.
@@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
                goto err1;
        sb->s_flags |= MS_ACTIVE;
-        err = logfs_get_sb_final(sb, mnt);
+        err = logfs_get_sb_final(sb);
-        if (err)
+        if (err) {
                deactivate_locked_super(sb);
-        return err;
+                return ERR_PTR(err);
+        }
+        return dget(sb->s_root);
 err1:
        /* no ->s_root, no ->put_super() */
@@ -592,37 +587,45 @@ err1:
        iput(super->s_segfile_inode);
        iput(super->s_mapping_inode);
        deactivate_locked_super(sb);
-        return err;
+        return ERR_PTR(err);
-err0:
-        kfree(super);
-        //devops->put_device(sb);
-        return err;
 }
-static int logfs_get_sb(struct file_system_type *type, int flags,
+static struct dentry *logfs_mount(struct file_system_type *type, int flags,
-                const char *devname, void *data, struct vfsmount *mnt)
+                const char *devname, void *data)
 {
        ulong mtdnr;
+        struct logfs_super *super;
+        int err;
-        if (!devname)
+        super = kzalloc(sizeof(*super), GFP_KERNEL);
-                return logfs_get_sb_bdev(type, flags, devname, mnt);
+        if (!super)
-        if (strncmp(devname, "mtd", 3))
+                return ERR_PTR(-ENOMEM);
-                return logfs_get_sb_bdev(type, flags, devname, mnt);
-        {
+        if (!devname)
+                err = logfs_get_sb_bdev(super, type, devname);
+        else if (strncmp(devname, "mtd", 3))
+                err = logfs_get_sb_bdev(super, type, devname);
+        else {
                char *garbage;
                mtdnr = simple_strtoul(devname+3, &garbage, 0);
                if (*garbage)
-                        return -EINVAL;
+                        err = -EINVAL;
+                else
+                        err = logfs_get_sb_mtd(super, mtdnr);
+        }
+        if (err) {
+                kfree(super);
+                return ERR_PTR(err);
        }
-        return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+        return logfs_get_sb_device(super, type, flags);
 }
 static struct file_system_type logfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "logfs",
-        .get_sb         = logfs_get_sb,
+        .mount          = logfs_mount,
        .kill_sb        = logfs_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e39d6bf2e8fb..fb2020858a34 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode)
                V2_minix_truncate(inode);
 }
-static int minix_get_sb(struct file_system_type *fs_type,
+static struct dentry *minix_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
-                           mnt);
 }
 static struct file_system_type minix_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "minix",
-        .get_sb         = minix_get_sb,
+        .mount          = minix_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578393a4..c0d35a3accef 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..5362af9b7372 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 static struct dentry *__lookup_hash(struct qstr *name,
                struct dentry *base, struct nameidata *nd)
 {
+        struct inode *inode = base->d_inode;
        struct dentry *dentry;
-        struct inode *inode;
        int err;
-        inode = base->d_inode;
+        err = exec_permission(inode);
+        if (err)
+                return ERR_PTR(err);
        /*
         * See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@ out:
 */
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
-        int err;
-        err = exec_permission(nd->path.dentry->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
@@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        if (err)
                return ERR_PTR(err);
-        err = exec_permission(base->d_inode);
-        if (err)
-                return ERR_PTR(err);
        return __lookup_hash(&this, base, NULL);
 }
@@ -1580,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd,
         */
        if (will_truncate)
                mnt_drop_write(nd->path.mnt);
+        path_put(&nd->path);
        return filp;
 exit:
@@ -1681,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                }
                filp = nameidata_to_filp(nd);
                mnt_drop_write(nd->path.mnt);
+                path_put(&nd->path);
                if (!IS_ERR(filp)) {
                        error = ima_file_check(filp, acc_mode);
                        if (error) {
@@ -2291,7 +2287,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                        goto slashes;
                inode = dentry->d_inode;
                if (inode)
-                        atomic_inc(&inode->i_count);
+                        ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f2..8a415c9c5e55 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                goto out_free;
                }
-                mnt->mnt_flags = old->mnt_flags;
+                mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
                atomic_inc(&sb->s_active);
                mnt->mnt_sb = sb;
                mnt->mnt_root = dget(root);
@@ -1744,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        lock_kernel();
        mnt = do_kern_mount(type, flags, name, data);
-        unlock_kernel();
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9578cbe0cd58..aac8832e919e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -95,6 +95,34 @@ const struct dentry_operations ncp_root_dentry_operations =
 };
+#define ncp_namespace(i)        (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
+static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
+{
+#ifdef CONFIG_NCPFS_SMALLDOS
+        int ns = ncp_namespace(i);
+        if ((ns == NW_NS_DOS)
+#ifdef CONFIG_NCPFS_OS2_NS
+                || ((ns == NW_NS_OS2) && (nscreator == NW_NS_DOS))
+#endif /* CONFIG_NCPFS_OS2_NS */
+           )
+                return 0;
+#endif /* CONFIG_NCPFS_SMALLDOS */
+        return 1;
+}
+#define ncp_preserve_case(i)    (ncp_namespace(i) != NW_NS_DOS)
+static inline int ncp_case_sensitive(struct dentry *dentry)
+{
+#ifdef CONFIG_NCPFS_NFS_NS
+        return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+#else
+        return 0;
+#endif /* CONFIG_NCPFS_NFS_NS */
+}
 /*
 * Note: leave the hash unchanged if the directory
 * is case-sensitive.
@@ -102,13 +130,12 @@ const struct dentry_operations ncp_root_dentry_operations =
 static int 
 ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 {
-        struct nls_table *t;
+        if (!ncp_case_sensitive(dentry)) {
-        unsigned long hash;
+                struct nls_table *t;
-        int i;
+                unsigned long hash;
+                int i;
-        t = NCP_IO_TABLE(dentry);
-        if (!ncp_case_sensitive(dentry->d_inode)) {
+                t = NCP_IO_TABLE(dentry);
                hash = init_name_hash();
                for (i=0; i<this->len ; i++)
                        hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -124,7 +151,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
        if (a->len != b->len)
                return 1;
-        if (ncp_case_sensitive(dentry->d_inode))
+        if (ncp_case_sensitive(dentry))
                return strncmp(a->name, b->name, a->len);
        return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
@@ -266,7 +293,7 @@ leave_me:;
 static int
-__ncp_lookup_validate(struct dentry *dentry)
+ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
 {
        struct ncp_server *server;
        struct dentry *parent;
@@ -283,9 +310,6 @@ __ncp_lookup_validate(struct dentry *dentry)
        server = NCP_SERVER(dir);
-        if (!ncp_conn_valid(server))
-                goto finished;
        /*
         * Inspired by smbfs:
         * The default validation is based on dentry age:
@@ -304,8 +328,11 @@ __ncp_lookup_validate(struct dentry *dentry)
        if (ncp_is_server_root(dir)) {
                res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
                                 dentry->d_name.len, 1);
-                if (!res)
+                if (!res) {
                        res = ncp_lookup_volume(server, __name, &(finfo.i));
+                        if (!res)
+                                ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+                }
        } else {
                res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
                                 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -320,13 +347,17 @@ __ncp_lookup_validate(struct dentry *dentry)
         * what we remember, it's not valid any more.
         */
        if (!res) {
-                if (finfo.i.dirEntNum == NCP_FINFO(dentry->d_inode)->dirEntNum) {
+                struct inode *inode = dentry->d_inode;
+                mutex_lock(&inode->i_mutex);
+                if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
                        ncp_new_dentry(dentry);
                        val=1;
                } else
                        DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
-                ncp_update_inode2(dentry->d_inode, &finfo);
+                ncp_update_inode2(inode, &finfo);
+                mutex_unlock(&inode->i_mutex);
        }
 finished:
@@ -335,16 +366,6 @@ finished:
        return val;
 }
-static int
-ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-        int res;
-        lock_kernel();
-        res = __ncp_lookup_validate(dentry);
-        unlock_kernel();
-        return res;
-}
 static struct dentry *
 ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 {
@@ -411,8 +432,6 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
        int result, mtime_valid = 0;
        time_t mtime = 0;
-        lock_kernel();
        ctl.page  = NULL;
        ctl.cache = NULL;
@@ -421,6 +440,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
                (int) filp->f_pos);
        result = -EIO;
+        /* Do not generate '.' and '..' when server is dead. */
        if (!ncp_conn_valid(server))
                goto out;
@@ -532,6 +552,12 @@ read_really:
        ctl.head.end = ctl.fpos - 1;
        ctl.head.eof = ctl.valid;
 finished:
+        if (ctl.page) {
+                kunmap(ctl.page);
+                SetPageUptodate(ctl.page);
+                unlock_page(ctl.page);
+                page_cache_release(ctl.page);
+        }
        if (page) {
                cache->head = ctl.head;
                kunmap(page);
@@ -539,23 +565,17 @@ finished:
                unlock_page(page);
                page_cache_release(page);
        }
-        if (ctl.page) {
-                kunmap(ctl.page);
-                SetPageUptodate(ctl.page);
-                unlock_page(ctl.page);
-                page_cache_release(ctl.page);
-        }
 out:
-        unlock_kernel();
        return result;
 }
 static int
 ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-                struct ncp_cache_control *ctrl, struct ncp_entry_info *entry)
+                struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
+                int inval_childs)
 {
        struct dentry *newdent, *dentry = filp->f_path.dentry;
-        struct inode *newino, *inode = dentry->d_inode;
+        struct inode *dir = dentry->d_inode;
        struct ncp_cache_control ctl = *ctrl;
        struct qstr qname;
        int valid = 0;
@@ -564,9 +584,9 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        __u8 __name[NCP_MAXPATHLEN + 1];
        qname.len = sizeof(__name);
-        if (ncp_vol2io(NCP_SERVER(inode), __name, &qname.len,
+        if (ncp_vol2io(NCP_SERVER(dir), __name, &qname.len,
                        entry->i.entryName, entry->i.nameLen,
-                        !ncp_preserve_entry_case(inode, entry->i.NSCreator)))
+                        !ncp_preserve_entry_case(dir, entry->i.NSCreator)))
                return 1; /* I'm not sure */
        qname.name = __name;
@@ -584,22 +604,64 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                        goto end_advance;
        } else {
                hashed = 1;
-                memcpy((char *) newdent->d_name.name, qname.name,
-                                                        newdent->d_name.len);
+                /* If case sensitivity changed for this volume, all entries below this one
+                   should be thrown away.  This entry itself is not affected, as its case
+                   sensitivity is controlled by its own parent. */
+                if (inval_childs)
+                        shrink_dcache_parent(newdent);
+                /*
+                 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
+                 * case preserving yet case insensitive.  So we update dentry's name
+                 * as received from server.  We found dentry via d_lookup with our
+                 * hash, so we know that hash does not change, and so replacing name
+                 * should be reasonably safe.
+                 */
+                if (qname.len == newdent->d_name.len &&
+                    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
+                        struct inode *inode = newdent->d_inode;
+                        /*
+                         * Inside ncpfs all uses of d_name are either for debugging,
+                         * or on functions which acquire inode mutex (mknod, creat,
+                         * lookup).  So grab i_mutex here, to be sure.  d_path
+                         * uses dcache_lock when generating path, so we should too.
+                         * And finally d_compare is protected by dentry's d_lock, so
+                         * here we go.
+                         */
+                        if (inode)
+                                mutex_lock(&inode->i_mutex);
+                        spin_lock(&dcache_lock);
+                        spin_lock(&newdent->d_lock);
+                        memcpy((char *) newdent->d_name.name, qname.name,
+                                                                newdent->d_name.len);
+                        spin_unlock(&newdent->d_lock);
+                        spin_unlock(&dcache_lock);
+                        if (inode)
+                                mutex_unlock(&inode->i_mutex);
+                }
        }
        if (!newdent->d_inode) {
+                struct inode *inode;
                entry->opened = 0;
-                entry->ino = iunique(inode->i_sb, 2);
+                entry->ino = iunique(dir->i_sb, 2);
-                newino = ncp_iget(inode->i_sb, entry);
+                inode = ncp_iget(dir->i_sb, entry);
-                if (newino) {
+                if (inode) {
                        newdent->d_op = &ncp_dentry_operations;
-                        d_instantiate(newdent, newino);
+                        d_instantiate(newdent, inode);
                        if (!hashed)
                                d_rehash(newdent);
                }
-        } else
+        } else {
-                ncp_update_inode2(newdent->d_inode, entry);
+                struct inode *inode = newdent->d_inode;
+                mutex_lock(&inode->i_mutex);
+                ncp_update_inode2(inode, entry);
+                mutex_unlock(&inode->i_mutex);
+        }
        if (newdent->d_inode) {
                ino = newdent->d_inode->i_ino;
@@ -617,7 +679,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
                ctl.cache = NULL;
                ctl.idx  -= NCP_DIRCACHE_SIZE;
                ctl.ofs  += 1;
-                ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
+                ctl.page  = grab_cache_page(&dir->i_data, ctl.ofs);
                if (ctl.page)
                        ctl.cache = kmap(ctl.page);
        }
@@ -633,7 +695,7 @@ end_advance:
                if (!ino)
                        ino = find_inode_number(dentry, &qname);
                if (!ino)
-                        ino = iunique(inode->i_sb, 2);
+                        ino = iunique(dir->i_sb, 2);
                ctl.filled = filldir(dirent, qname.name, qname.len,
                                     filp->f_pos, ino, DT_UNKNOWN);
                if (!ctl.filled)
@@ -660,6 +722,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
                        (unsigned long) filp->f_pos);
        for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
+                int inval_dentry;
                if (ncp_get_volume_info_with_number(server, i, &info) != 0)
                        return;
@@ -675,8 +738,9 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
                                info.volume_name);
                        continue;
                }
+                inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
                entry.volume = entry.i.volNumber;
-                if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+                if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
                        return;
        }
 }
@@ -739,7 +803,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
                        rpl += onerpl;
                        rpls -= onerpl;
                        entry.volume = entry.i.volNumber;
-                        if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry))
+                        if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
                                break;
                }
        } while (more);
@@ -775,17 +839,19 @@ int ncp_conn_logged_in(struct super_block *sb)
                if (dent) {
                        struct inode* ino = dent->d_inode;
                        if (ino) {
+                                ncp_update_known_namespace(server, volNumber, NULL);
                                NCP_FINFO(ino)->volNumber = volNumber;
                                NCP_FINFO(ino)->dirEntNum = dirEntNum;
                                NCP_FINFO(ino)->DosDirNum = DosDirNum;
+                                result = 0;
                        } else {
                                DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
                        }
                } else {
                        DPRINTK("ncpfs: sb->s_root == NULL!\n");
                }
-        }
+        } else
-        result = 0;
+                result = 0;
 out:
        return result;
@@ -799,7 +865,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
        int error, res, len;
        __u8 __name[NCP_MAXPATHLEN + 1];
-        lock_kernel();
        error = -EIO;
        if (!ncp_conn_valid(server))
                goto finished;
@@ -813,6 +878,8 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
                                 dentry->d_name.len, 1);
                if (!res)
                        res = ncp_lookup_volume(server, __name, &(finfo.i));
+                        if (!res)
+                                ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
        } else {
                res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
                                 dentry->d_name.len, !ncp_preserve_case(dir));
@@ -846,7 +913,6 @@ add_entry:
 finished:
        PPRINTK("ncp_lookup: result=%d\n", error);
-        unlock_kernel();
        return ERR_PTR(error);
 }
@@ -887,11 +953,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
        PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n",
                dentry->d_parent->d_name.name, dentry->d_name.name, mode);
-        error = -EIO;
-        lock_kernel();
-        if (!ncp_conn_valid(server))
-                goto out;
        ncp_age_dentry(server, dentry);
        len = sizeof(__name);
        error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -917,6 +978,8 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
                if (result) {
                        if (result == 0x87)
                                error = -ENAMETOOLONG;
+                        else if (result < 0)
+                                error = result;
                        DPRINTK("ncp_create: %s/%s failed\n",
                                dentry->d_parent->d_name.name, dentry->d_name.name);
                        goto out;
@@ -935,7 +998,6 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode,
        error = ncp_instantiate(dir, dentry, &finfo);
 out:
-        unlock_kernel();
        return error;
 }
@@ -955,11 +1017,6 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        DPRINTK("ncp_mkdir: making %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        error = -EIO;
-        lock_kernel();
-        if (!ncp_conn_valid(server))
-                goto out;
        ncp_age_dentry(server, dentry);
        len = sizeof(__name);
        error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -967,12 +1024,11 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (error)
                goto out;
-        error = -EACCES;
+        error = ncp_open_create_file_or_subdir(server, dir, __name,
-        if (ncp_open_create_file_or_subdir(server, dir, __name,
                                           OC_MODE_CREATE, aDIR,
                                           cpu_to_le16(0xffff),
-                                           &finfo) == 0)
+                                           &finfo);
-        {
+        if (error == 0) {
                if (ncp_is_nfs_extras(server, finfo.volume)) {
                        mode |= S_IFDIR;
                        finfo.i.nfs.mode = mode;
@@ -983,9 +1039,10 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                                goto out;
                }
                error = ncp_instantiate(dir, dentry, &finfo);
+        } else if (error > 0) {
+                error = -EACCES;
        }
 out:
-        unlock_kernel();
        return error;
 }
@@ -998,11 +1055,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
        DPRINTK("ncp_rmdir: removing %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        error = -EIO;
-        lock_kernel();
-        if (!ncp_conn_valid(server))
-                goto out;
        error = -EBUSY;
        if (!d_unhashed(dentry))
                goto out;
@@ -1036,11 +1088,10 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
                        error = -ENOENT;
                        break;
                default:
-                        error = -EACCES;
+                        error = result < 0 ? result : -EACCES;
                        break;
        }
 out:
-        unlock_kernel();
        return error;
 }
@@ -1050,15 +1101,10 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
        struct ncp_server *server;
        int error;
-        lock_kernel();
        server = NCP_SERVER(dir);
        DPRINTK("ncp_unlink: unlinking %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
        
-        error = -EIO;
-        if (!ncp_conn_valid(server))
-                goto out;
        /*
         * Check whether to close the file ...
         */
@@ -1097,12 +1143,9 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
                        error = -ENOENT;
                        break;
                default:
-                        error = -EACCES;
+                        error = error < 0 ? error : -EACCES;
                        break;
        }
-                
-out:
-        unlock_kernel();
        return error;
 }
@@ -1118,11 +1161,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
                old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
-        error = -EIO;
-        lock_kernel();
-        if (!ncp_conn_valid(server))
-                goto out;
        ncp_age_dentry(server, old_dentry);
        ncp_age_dentry(server, new_dentry);
@@ -1161,11 +1199,10 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
                        error = -ENOENT;
                        break;
                default:
-                        error = -EACCES;
+                        error = error < 0 ? error : -EACCES;
                        break;
        }
 out:
-        unlock_kernel();
        return error;
 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3639cc5cbdae..6c754f70c529 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -113,9 +113,6 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        DPRINTK("ncp_file_read: enter %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        if (!ncp_conn_valid(NCP_SERVER(inode)))
-                return -EIO;
        pos = *ppos;
        if ((ssize_t) count < 0) {
@@ -192,13 +189,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
        DPRINTK("ncp_file_write: enter %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        if (!ncp_conn_valid(NCP_SERVER(inode)))
-                return -EIO;
        if ((ssize_t) count < 0)
                return -EINVAL;
        pos = *ppos;
        if (file->f_flags & O_APPEND) {
-                pos = inode->i_size;
+                pos = i_size_read(inode);
        }
        if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
@@ -264,8 +259,11 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
        *ppos = pos;
-        if (pos > inode->i_size) {
+        if (pos > i_size_read(inode)) {
-                inode->i_size = pos;
+                mutex_lock(&inode->i_mutex);
+                if (pos > i_size_read(inode))
+                        i_size_write(inode, pos);
+                mutex_unlock(&inode->i_mutex);
        }
        DPRINTK("ncp_file_write: exit %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -281,18 +279,9 @@ static int ncp_release(struct inode *inode, struct file *file) {
        return 0;
 }
-static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-        loff_t ret;
-        lock_kernel();
-        ret = generic_file_llseek_unlocked(file, offset, origin);
-        unlock_kernel();
-        return ret;
-}
 const struct file_operations ncp_file_operations =
 {
-        .llseek         = ncp_remote_llseek,
+        .llseek         = generic_file_llseek,
        .read           = ncp_file_read,
        .write          = ncp_file_write,
        .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index b4de38cf49f5..d290545aa0c4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -139,7 +139,7 @@ static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
                inode->i_mode = nwi->nfs.mode;
        }
-        inode->i_blocks = (inode->i_size + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
+        inode->i_blocks = (i_size_read(inode) + NCP_BLOCK_SIZE - 1) >> NCP_BLOCK_SHIFT;
        inode->i_mtime.tv_sec = ncp_date_dos2unix(nwi->modifyTime, nwi->modifyDate);
        inode->i_ctime.tv_sec = ncp_date_dos2unix(nwi->creationTime, nwi->creationDate);
@@ -158,18 +158,21 @@ static void ncp_update_attrs(struct inode *inode, struct ncp_entry_info *nwinfo)
                inode->i_mode = server->m.dir_mode;
                /* for directories dataStreamSize seems to be some
                   Object ID ??? */
-                inode->i_size = NCP_BLOCK_SIZE;
+                i_size_write(inode, NCP_BLOCK_SIZE);
        } else {
+                u32 size;
                inode->i_mode = server->m.file_mode;
-                inode->i_size = le32_to_cpu(nwi->dataStreamSize);
+                size = le32_to_cpu(nwi->dataStreamSize);
+                i_size_write(inode, size);
 #ifdef CONFIG_NCPFS_EXTRAS
                if ((server->m.flags & (NCP_MOUNT_EXTRAS|NCP_MOUNT_SYMLINKS)) 
                 && (nwi->attributes & aSHARED)) {
                        switch (nwi->attributes & (aHIDDEN|aSYSTEM)) {
                                case aHIDDEN:
                                        if (server->m.flags & NCP_MOUNT_SYMLINKS) {
-                                                if (/* (inode->i_size >= NCP_MIN_SYMLINK_SIZE)
+                                                if (/* (size >= NCP_MIN_SYMLINK_SIZE)
-                                                 && */ (inode->i_size <= NCP_MAX_SYMLINK_SIZE)) {
+                                                 && */ (size <= NCP_MAX_SYMLINK_SIZE)) {
                                                        inode->i_mode = (inode->i_mode & ~S_IFMT) | S_IFLNK;
                                                        NCP_FINFO(inode)->flags |= NCPI_KLUDGE_SYMLINK;
                                                        break;
@@ -208,7 +211,7 @@ void ncp_update_inode2(struct inode* inode, struct ncp_entry_info *nwinfo)
 }
 /*
- * Fill in the inode based on the ncp_entry_info structure.
+ * Fill in the inode based on the ncp_entry_info structure.  Used only for brand new inodes.
 */
 static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 {
@@ -254,6 +257,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
        if (inode) {
                atomic_set(&NCP_FINFO(inode)->opened, info->opened);
+                inode->i_mapping->backing_dev_info = sb->s_bdi;
                inode->i_ino = info->ino;
                ncp_set_attr(inode, info);
                if (S_ISREG(inode->i_mode)) {
@@ -299,10 +303,12 @@ ncp_evict_inode(struct inode *inode)
 static void ncp_stop_tasks(struct ncp_server *server) {
        struct sock* sk = server->ncp_sock->sk;
-                
+        lock_sock(sk);
        sk->sk_error_report = server->error_report;
        sk->sk_data_ready   = server->data_ready;
        sk->sk_write_space  = server->write_space;
+        release_sock(sk);
        del_timer_sync(&server->timeout_tm);
        flush_scheduled_work();
 }
@@ -565,10 +571,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 /*      server->conn_status = 0;        */
 /*      server->root_dentry = NULL;     */
 /*      server->root_setuped = 0;       */
+        mutex_init(&server->root_setup_lock);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 /*      server->sign_wanted = 0;        */
 /*      server->sign_active = 0;        */
 #endif
+        init_rwsem(&server->auth_rwsem);
        server->auth.auth_type = NCP_AUTH_NONE;
 /*      server->auth.object_name_len = 0;       */
 /*      server->auth.object_name = NULL;        */
@@ -593,16 +601,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        server->nls_io = load_nls_default();
 #endif /* CONFIG_NCPFS_NLS */
-        server->dentry_ttl = 0; /* no caching */
+        atomic_set(&server->dentry_ttl, 0);     /* no caching */
        INIT_LIST_HEAD(&server->tx.requests);
        mutex_init(&server->rcv.creq_mutex);
        server->tx.creq         = NULL;
        server->rcv.creq        = NULL;
-        server->data_ready      = sock->sk->sk_data_ready;
-        server->write_space     = sock->sk->sk_write_space;
-        server->error_report    = sock->sk->sk_error_report;
-        sock->sk->sk_user_data  = server;
        init_timer(&server->timeout_tm);
 #undef NCP_PACKET_SIZE
@@ -619,6 +623,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        if (server->rxbuf == NULL)
                goto out_txbuf;
+        lock_sock(sock->sk);
+        server->data_ready      = sock->sk->sk_data_ready;
+        server->write_space     = sock->sk->sk_write_space;
+        server->error_report    = sock->sk->sk_error_report;
+        sock->sk->sk_user_data  = server;
        sock->sk->sk_data_ready   = ncp_tcp_data_ready;
        sock->sk->sk_error_report = ncp_tcp_error_report;
        if (sock->type == SOCK_STREAM) {
@@ -634,6 +643,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                server->timeout_tm.data = (unsigned long)server;
                server->timeout_tm.function = ncpdgram_timeout_call;
        }
+        release_sock(sock->sk);
        ncp_lock_server(server);
        error = ncp_connect(server);
@@ -658,8 +668,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
                                goto out_disconnect;
                        }
                }
+                ncp_lock_server(server);
                if (options & 2)
                        server->sign_wanted = 1;
+                ncp_unlock_server(server);
        }
        else 
 #endif  /* CONFIG_NCPFS_PACKET_SIGNING */
@@ -720,6 +732,9 @@ out_nls:
        unload_nls(server->nls_io);
        unload_nls(server->nls_vol);
 #endif
+        mutex_destroy(&server->rcv.creq_mutex);
+        mutex_destroy(&server->root_setup_lock);
+        mutex_destroy(&server->mutex);
 out_fput2:
        if (server->info_filp)
                fput(server->info_filp);
@@ -743,8 +758,6 @@ static void ncp_put_super(struct super_block *sb)
 {
        struct ncp_server *server = NCP_SBP(sb);
-        lock_kernel();
        ncp_lock_server(server);
        ncp_disconnect(server);
        ncp_unlock_server(server);
@@ -756,6 +769,9 @@ static void ncp_put_super(struct super_block *sb)
        unload_nls(server->nls_vol);
        unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
+        mutex_destroy(&server->rcv.creq_mutex);
+        mutex_destroy(&server->root_setup_lock);
+        mutex_destroy(&server->mutex);
        if (server->info_filp)
                fput(server->info_filp);
@@ -771,8 +787,6 @@ static void ncp_put_super(struct super_block *sb)
        vfree(server->packet);
        sb->s_fs_info = NULL;
        kfree(server);
-        unlock_kernel();
 }
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -851,10 +865,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
        result = -EIO;
-        lock_kernel();  
        server = NCP_SERVER(inode);
-        if ((!server) || !ncp_conn_valid(server))
+        if (!server)    /* How this could happen? */
                goto out;
        /* ageing the dentry to force validation */
@@ -981,8 +993,6 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
                result = ncp_modify_file_or_subdir_dos_info(NCP_SERVER(inode),
                                      inode, info_mask, &info);
                if (result != 0) {
-                        result = -EACCES;
                        if (info_mask == (DM_CREATE_TIME | DM_CREATE_DATE)) {
                                /* NetWare seems not to allow this. I
                                   do not know why. So, just tell the
@@ -1005,20 +1015,21 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
        mark_inode_dirty(inode);
 out:
-        unlock_kernel();
+        if (result > 0)
+                result = -EACCES;
        return result;
 }
-static int ncp_get_sb(struct file_system_type *fs_type,
+static struct dentry *ncp_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, ncp_fill_super);
 }
 static struct file_system_type ncp_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ncpfs",
-        .get_sb         = ncp_get_sb,
+        .mount          = ncp_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 84a8cfc4e38e..c2a1f9a155c3 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -35,16 +35,11 @@
 #define NCP_PACKET_SIZE_INTERNAL 65536
 static int
-ncp_get_fs_info(struct ncp_server * server, struct file *file,
+ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
                struct ncp_fs_info __user *arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
        struct ncp_fs_info info;
-        if (file_permission(file, MAY_WRITE) != 0
-            && current_uid() != server->m.mounted_uid)
-                return -EACCES;
        if (copy_from_user(&info, arg, sizeof(info)))
                return -EFAULT;
@@ -65,16 +60,11 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
 }
 static int
-ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
                   struct ncp_fs_info_v2 __user * arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
        struct ncp_fs_info_v2 info2;
-        if (file_permission(file, MAY_WRITE) != 0
-            && current_uid() != server->m.mounted_uid)
-                return -EACCES;
        if (copy_from_user(&info2, arg, sizeof(info2)))
                return -EFAULT;
@@ -136,16 +126,11 @@ struct compat_ncp_privatedata_ioctl
 #define NCP_IOC_SETPRIVATEDATA_32       _IOR('n', 10, struct compat_ncp_privatedata_ioctl)
 static int
-ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
                   struct compat_ncp_fs_info_v2 __user * arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
        struct compat_ncp_fs_info_v2 info2;
-        if (file_permission(file, MAY_WRITE) != 0
-            && current_uid() != server->m.mounted_uid)
-                return -EACCES;
        if (copy_from_user(&info2, arg, sizeof(info2)))
                return -EFAULT;
@@ -182,11 +167,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
        struct nls_table *iocharset;
        struct nls_table *oldset_io;
        struct nls_table *oldset_cp;
+        int utf8;
-        if (!capable(CAP_SYS_ADMIN))
+        int err;
-                return -EACCES;
-        if (server->root_setuped)
-                return -EBUSY;
        if (copy_from_user(&user, arg, sizeof(user)))
                return -EFAULT;
@@ -206,28 +188,40 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
        user.iocharset[NCP_IOCSNAME_LEN] = 0;
        if (!user.iocharset[0] || !strcmp(user.iocharset, "default")) {
                iocharset = load_nls_default();
-                NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+                utf8 = 0;
        } else if (!strcmp(user.iocharset, "utf8")) {
                iocharset = load_nls_default();
-                NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+                utf8 = 1;
        } else {
                iocharset = load_nls(user.iocharset);
                if (!iocharset) {
                        unload_nls(codepage);
                        return -EBADRQC;
                }
-                NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+                utf8 = 0;
        }
-        oldset_cp = server->nls_vol;
+        mutex_lock(&server->root_setup_lock);
-        server->nls_vol = codepage;
+        if (server->root_setuped) {
-        oldset_io = server->nls_io;
+                oldset_cp = codepage;
-        server->nls_io = iocharset;
+                oldset_io = iocharset;
+                err = -EBUSY;
+        } else {
+                if (utf8)
+                        NCP_SET_FLAG(server, NCP_FLAG_UTF8);
+                else
+                        NCP_CLR_FLAG(server, NCP_FLAG_UTF8);
+                oldset_cp = server->nls_vol;
+                server->nls_vol = codepage;
+                oldset_io = server->nls_io;
+                server->nls_io = iocharset;
+                err = 0;
+        }
+        mutex_unlock(&server->root_setup_lock);
        unload_nls(oldset_cp);
        unload_nls(oldset_io);
-        return 0;
+        return err;
 }
 static int
@@ -237,6 +231,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
        int len;
        memset(&user, 0, sizeof(user));
+        mutex_lock(&server->root_setup_lock);
        if (server->nls_vol && server->nls_vol->charset) {
                len = strlen(server->nls_vol->charset);
                if (len > NCP_IOCSNAME_LEN)
@@ -254,6 +249,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
                strncpy(user.iocharset, server->nls_io->charset, len);
                user.iocharset[len] = 0;
        }
+        mutex_unlock(&server->root_setup_lock);
        if (copy_to_user(arg, &user, sizeof(user)))
                return -EFAULT;
@@ -261,25 +257,19 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
-static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg)
 {
-        struct inode *inode = filp->f_dentry->d_inode;
        struct ncp_server *server = NCP_SERVER(inode);
        int result;
        struct ncp_ioctl_request request;
        char* bouncebuffer;
        void __user *argp = (void __user *)arg;
-        uid_t uid = current_uid();
        switch (cmd) {
 #ifdef CONFIG_COMPAT
        case NCP_IOC_NCPREQUEST_32:
 #endif
        case NCP_IOC_NCPREQUEST:
-                if (file_permission(filp, MAY_WRITE) != 0
-                    && uid != server->m.mounted_uid)
-                        return -EACCES;
 #ifdef CONFIG_COMPAT
                if (cmd == NCP_IOC_NCPREQUEST_32) {
                        struct compat_ncp_ioctl_request request32;
@@ -314,7 +304,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                server->current_size = request.size;
                memcpy(server->packet, bouncebuffer, request.size);
-                result = ncp_request2(server, request.function, 
+                result = ncp_request2(server, request.function,
                        bouncebuffer, NCP_PACKET_SIZE_INTERNAL);
                if (result < 0)
                        result = -EIO;
@@ -331,69 +321,69 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        case NCP_IOC_CONN_LOGGED_IN:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EACCES;
                if (!(server->m.int_flags & NCP_IMOUNT_LOGGEDIN_POSSIBLE))
                        return -EINVAL;
+                mutex_lock(&server->root_setup_lock);
                if (server->root_setuped)
-                        return -EBUSY;
+                        result = -EBUSY;
-                server->root_setuped = 1;
+                else {
-                return ncp_conn_logged_in(inode->i_sb);
+                        result = ncp_conn_logged_in(inode->i_sb);
+                        if (result == 0)
+                                server->root_setuped = 1;
+                }
+                mutex_unlock(&server->root_setup_lock);
+                return result;
        case NCP_IOC_GET_FS_INFO:
-                return ncp_get_fs_info(server, filp, argp);
+                return ncp_get_fs_info(server, inode, argp);
        case NCP_IOC_GET_FS_INFO_V2:
-                return ncp_get_fs_info_v2(server, filp, argp);
+                return ncp_get_fs_info_v2(server, inode, argp);
 #ifdef CONFIG_COMPAT
        case NCP_IOC_GET_FS_INFO_V2_32:
-                return ncp_get_compat_fs_info_v2(server, filp, argp);
+                return ncp_get_compat_fs_info_v2(server, inode, argp);
 #endif
        /* we have too many combinations of CONFIG_COMPAT,
         * CONFIG_64BIT and CONFIG_UID16, so just handle
         * any of the possible ioctls */
        case NCP_IOC_GETMOUNTUID16:
-        case NCP_IOC_GETMOUNTUID32:
+                {
-        case NCP_IOC_GETMOUNTUID64:
-                if (file_permission(filp, MAY_READ) != 0
-                        && uid != server->m.mounted_uid)
-                        return -EACCES;
-                if (cmd == NCP_IOC_GETMOUNTUID16) {
                        u16 uid;
                        SET_UID(uid, server->m.mounted_uid);
                        if (put_user(uid, (u16 __user *)argp))
                                return -EFAULT;
-                } else if (cmd == NCP_IOC_GETMOUNTUID32) {
+                        return 0;
-                        if (put_user(server->m.mounted_uid,
-                                                (u32 __user *)argp))
-                                return -EFAULT;
-                } else {
-                        if (put_user(server->m.mounted_uid,
-                                                (u64 __user *)argp))
-                                return -EFAULT;
                }
+        case NCP_IOC_GETMOUNTUID32:
+                if (put_user(server->m.mounted_uid,
+                             (u32 __user *)argp))
+                        return -EFAULT;
+                return 0;
+        case NCP_IOC_GETMOUNTUID64:
+                if (put_user(server->m.mounted_uid,
+                             (u64 __user *)argp))
+                        return -EFAULT;
                return 0;
        case NCP_IOC_GETROOT:
                {
                        struct ncp_setroot_ioctl sr;
-                        if (file_permission(filp, MAY_READ) != 0
+                        result = -EACCES;
-                            && uid != server->m.mounted_uid)
+                        mutex_lock(&server->root_setup_lock);
-                                return -EACCES;
                        if (server->m.mounted_vol[0]) {
                                struct dentry* dentry = inode->i_sb->s_root;
                                if (dentry) {
                                        struct inode* s_inode = dentry->d_inode;
-                                
                                        if (s_inode) {
                                                sr.volNumber = NCP_FINFO(s_inode)->volNumber;
                                                sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum;
                                                sr.namespace = server->name_space[sr.volNumber];
+                                                result = 0;
                                        } else
                                                DPRINTK("ncpfs: s_root->d_inode==NULL\n");
                                } else
@@ -402,10 +392,12 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                                sr.volNumber = -1;
                                sr.namespace = 0;
                                sr.dirEntNum = 0;
+                                result = 0;
                        }
-                        if (copy_to_user(argp, &sr, sizeof(sr)))
+                        mutex_unlock(&server->root_setup_lock);
-                                return -EFAULT;
+                        if (!result && copy_to_user(argp, &sr, sizeof(sr)))
-                        return 0;
+                                result = -EFAULT;
+                        return result;
                }
        case NCP_IOC_SETROOT:
@@ -416,103 +408,114 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        __le32 dosde;
                        struct dentry* dentry;
-                        if (!capable(CAP_SYS_ADMIN))
-                        {
-                                return -EACCES;
-                        }
-                        if (server->root_setuped) return -EBUSY;
                        if (copy_from_user(&sr, argp, sizeof(sr)))
                                return -EFAULT;
-                        if (sr.volNumber < 0) {
+                        mutex_lock(&server->root_setup_lock);
-                                server->m.mounted_vol[0] = 0;
+                        if (server->root_setuped)
-                                vnum = NCP_NUMBER_OF_VOLUMES;
+                                result = -EBUSY;
-                                de = 0;
+                        else {
-                                dosde = 0;
+                                if (sr.volNumber < 0) {
-                        } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
+                                        server->m.mounted_vol[0] = 0;
-                                return -EINVAL;
+                                        vnum = NCP_NUMBER_OF_VOLUMES;
-                        } else if (ncp_mount_subdir(server, sr.volNumber,
+                                        de = 0;
-                                                sr.namespace, sr.dirEntNum,
+                                        dosde = 0;
-                                                &vnum, &de, &dosde)) {
+                                        result = 0;
-                                return -ENOENT;
+                                } else if (sr.volNumber >= NCP_NUMBER_OF_VOLUMES) {
-                        }
+                                        result = -EINVAL;
-                        
+                                } else if (ncp_mount_subdir(server, sr.volNumber,
-                        dentry = inode->i_sb->s_root;
+                                                        sr.namespace, sr.dirEntNum,
-                        server->root_setuped = 1;
+                                                        &vnum, &de, &dosde)) {
-                        if (dentry) {
+                                        result = -ENOENT;
-                                struct inode* s_inode = dentry->d_inode;
-                                
-                                if (s_inode) {
-                                        NCP_FINFO(s_inode)->volNumber = vnum;
-                                        NCP_FINFO(s_inode)->dirEntNum = de;
-                                        NCP_FINFO(s_inode)->DosDirNum = dosde;
                                } else
-                                        DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+                                        result = 0;
-                        } else
-                                DPRINTK("ncpfs: s_root==NULL\n");
+                                if (result == 0) {
+                                        dentry = inode->i_sb->s_root;
+                                        if (dentry) {
+                                                struct inode* s_inode = dentry->d_inode;
+                                                if (s_inode) {
+                                                        NCP_FINFO(s_inode)->volNumber = vnum;
+                                                        NCP_FINFO(s_inode)->dirEntNum = de;
+                                                        NCP_FINFO(s_inode)->DosDirNum = dosde;
+                                                        server->root_setuped = 1;
+                                                } else {
+                                                        DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+                                                        result = -EIO;
+                                                }
+                                        } else {
+                                                DPRINTK("ncpfs: s_root==NULL\n");
+                                                result = -EIO;
+                                        }
+                                }
+                                result = 0;
+                        }
+                        mutex_unlock(&server->root_setup_lock);
-                        return 0;
+                        return result;
                }
-#ifdef CONFIG_NCPFS_PACKET_SIGNING      
+#ifdef CONFIG_NCPFS_PACKET_SIGNING
        case NCP_IOC_SIGN_INIT:
-                if (file_permission(filp, MAY_WRITE) != 0
+                {
-                    && uid != server->m.mounted_uid)
+                        struct ncp_sign_init sign;
-                        return -EACCES;
-                if (argp) {
-                        if (server->sign_wanted)
-                        {
-                                struct ncp_sign_init sign;
+                        if (argp)
                                if (copy_from_user(&sign, argp, sizeof(sign)))
                                        return -EFAULT;
-                                memcpy(server->sign_root,sign.sign_root,8);
+                        ncp_lock_server(server);
-                                memcpy(server->sign_last,sign.sign_last,16);
+                        mutex_lock(&server->rcv.creq_mutex);
-                                server->sign_active = 1;
+                        if (argp) {
+                                if (server->sign_wanted) {
+                                        memcpy(server->sign_root,sign.sign_root,8);
+                                        memcpy(server->sign_last,sign.sign_last,16);
+                                        server->sign_active = 1;
+                                }
+                                /* ignore when signatures not wanted */
+                        } else {
+                                server->sign_active = 0;
                        }
-                        /* ignore when signatures not wanted */
+                        mutex_unlock(&server->rcv.creq_mutex);
-                } else {
+                        ncp_unlock_server(server);
-                        server->sign_active = 0;
+                        return 0;
                }
-                return 0;               
-                
        case NCP_IOC_SIGN_WANTED:
-                if (file_permission(filp, MAY_READ) != 0
+                {
-                    && uid != server->m.mounted_uid)
+                        int state;
-                        return -EACCES;
-                
+                        ncp_lock_server(server);
-                if (put_user(server->sign_wanted, (int __user *)argp))
+                        state = server->sign_wanted;
-                        return -EFAULT;
+                        ncp_unlock_server(server);
-                return 0;
+                        if (put_user(state, (int __user *)argp))
+                                return -EFAULT;
+                        return 0;
+                }
        case NCP_IOC_SET_SIGN_WANTED:
                {
                        int newstate;
-                        if (file_permission(filp, MAY_WRITE) != 0
-                            && uid != server->m.mounted_uid)
-                                return -EACCES;
                        /* get only low 8 bits... */
                        if (get_user(newstate, (unsigned char __user *)argp))
                                return -EFAULT;
+                        result = 0;
+                        ncp_lock_server(server);
                        if (server->sign_active) {
                                /* cannot turn signatures OFF when active */
-                                if (!newstate) return -EINVAL;
+                                if (!newstate)
+                                        result = -EINVAL;
                        } else {
                                server->sign_wanted = newstate != 0;
                        }
-                        return 0;
+                        ncp_unlock_server(server);
+                        return result;
                }
 #endif /* CONFIG_NCPFS_PACKET_SIGNING */
 #ifdef CONFIG_NCPFS_IOCTL_LOCKING
        case NCP_IOC_LOCKUNLOCK:
-                if (file_permission(filp, MAY_WRITE) != 0
-                    && uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct ncp_lock_ioctl    rqdata;
@@ -541,16 +544,13 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        {
                                return result;
                        }
-                        result = -EIO;
-                        if (!ncp_conn_valid(server))
-                                goto outrel;
                        result = -EISDIR;
                        if (!S_ISREG(inode->i_mode))
                                goto outrel;
                        if (rqdata.cmd == NCP_LOCK_CLEAR)
                        {
                                result = ncp_ClearPhysicalRecord(NCP_SERVER(inode),
-                                                        NCP_FINFO(inode)->file_handle, 
+                                                        NCP_FINFO(inode)->file_handle,
                                                        rqdata.offset,
                                                        rqdata.length);
                                if (result > 0) result = 0;     /* no such lock */
@@ -573,7 +573,7 @@ static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                                                        rqdata.timeout);
                                if (result > 0) result = -EAGAIN;
                        }
-outrel:                 
+outrel:
                        ncp_inode_close(inode);
                        return result;
                }
@@ -581,60 +581,62 @@ outrel:
 #ifdef CONFIG_COMPAT
        case NCP_IOC_GETOBJECTNAME_32:
-                if (uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct compat_ncp_objectname_ioctl user;
                        size_t outl;
                        if (copy_from_user(&user, argp, sizeof(user)))
                                return -EFAULT;
+                        down_read(&server->auth_rwsem);
                        user.auth_type = server->auth.auth_type;
                        outl = user.object_name_len;
                        user.object_name_len = server->auth.object_name_len;
                        if (outl > user.object_name_len)
                                outl = user.object_name_len;
+                        result = 0;
                        if (outl) {
                                if (copy_to_user(compat_ptr(user.object_name),
                                                 server->auth.object_name,
-                                                 outl)) return -EFAULT;
+                                                 outl))
+                                        result = -EFAULT;
                        }
-                        if (copy_to_user(argp, &user, sizeof(user)))
+                        up_read(&server->auth_rwsem);
-                                return -EFAULT;
+                        if (!result && copy_to_user(argp, &user, sizeof(user)))
-                        return 0;
+                                result = -EFAULT;
+                        return result;
                }
 #endif
        case NCP_IOC_GETOBJECTNAME:
-                if (uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct ncp_objectname_ioctl user;
                        size_t outl;
                        if (copy_from_user(&user, argp, sizeof(user)))
                                return -EFAULT;
+                        down_read(&server->auth_rwsem);
                        user.auth_type = server->auth.auth_type;
                        outl = user.object_name_len;
                        user.object_name_len = server->auth.object_name_len;
                        if (outl > user.object_name_len)
                                outl = user.object_name_len;
+                        result = 0;
                        if (outl) {
                                if (copy_to_user(user.object_name,
                                                 server->auth.object_name,
-                                                 outl)) return -EFAULT;
+                                                 outl))
+                                        result = -EFAULT;
                        }
-                        if (copy_to_user(argp, &user, sizeof(user)))
+                        up_read(&server->auth_rwsem);
-                                return -EFAULT;
+                        if (!result && copy_to_user(argp, &user, sizeof(user)))
-                        return 0;
+                                result = -EFAULT;
+                        return result;
                }
 #ifdef CONFIG_COMPAT
        case NCP_IOC_SETOBJECTNAME_32:
 #endif
        case NCP_IOC_SETOBJECTNAME:
-                if (uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct ncp_objectname_ioctl user;
                        void* newname;
@@ -666,9 +668,7 @@ outrel:
                        } else {
                                newname = NULL;
                        }
-                        /* enter critical section */
+                        down_write(&server->auth_rwsem);
-                        /* maybe that kfree can sleep so do that this way */
-                        /* it is at least more SMP friendly (in future...) */
                        oldname = server->auth.object_name;
                        oldnamelen = server->auth.object_name_len;
                        oldprivate = server->priv.data;
@@ -678,7 +678,7 @@ outrel:
                        server->auth.object_name = newname;
                        server->priv.len = 0;
                        server->priv.data = NULL;
-                        /* leave critical section */
+                        up_write(&server->auth_rwsem);
                        kfree(oldprivate);
                        kfree(oldname);
                        return 0;
@@ -688,8 +688,6 @@ outrel:
        case NCP_IOC_GETPRIVATEDATA_32:
 #endif
        case NCP_IOC_GETPRIVATEDATA:
-                if (uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct ncp_privatedata_ioctl user;
                        size_t outl;
@@ -706,14 +704,20 @@ outrel:
                        if (copy_from_user(&user, argp, sizeof(user)))
                                return -EFAULT;
+                        down_read(&server->auth_rwsem);
                        outl = user.len;
                        user.len = server->priv.len;
                        if (outl > user.len) outl = user.len;
+                        result = 0;
                        if (outl) {
                                if (copy_to_user(user.data,
                                                 server->priv.data,
-                                                 outl)) return -EFAULT;
+                                                 outl))
+                                        result = -EFAULT;
                        }
+                        up_read(&server->auth_rwsem);
+                        if (result)
+                                return result;
 #ifdef CONFIG_COMPAT
                        if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
                                struct compat_ncp_privatedata_ioctl user32;
@@ -733,8 +737,6 @@ outrel:
        case NCP_IOC_SETPRIVATEDATA_32:
 #endif
        case NCP_IOC_SETPRIVATEDATA:
-                if (uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        struct ncp_privatedata_ioctl user;
                        void* new;
@@ -762,12 +764,12 @@ outrel:
                        } else {
                                new = NULL;
                        }
-                        /* enter critical section */
+                        down_write(&server->auth_rwsem);
                        old = server->priv.data;
                        oldlen = server->priv.len;
                        server->priv.len = user.len;
                        server->priv.data = new;
-                        /* leave critical section */
+                        up_write(&server->auth_rwsem);
                        kfree(old);
                        return 0;
                }
@@ -775,17 +777,13 @@ outrel:
 #ifdef CONFIG_NCPFS_NLS
        case NCP_IOC_SETCHARSETS:
                return ncp_set_charsets(server, argp);
-                
        case NCP_IOC_GETCHARSETS:
                return ncp_get_charsets(server, argp);
 #endif /* CONFIG_NCPFS_NLS */
        case NCP_IOC_SETDENTRYTTL:
-                if (file_permission(filp, MAY_WRITE) != 0 &&
-                    uid != server->m.mounted_uid)
-                        return -EACCES;
                {
                        u_int32_t user;
@@ -795,13 +793,13 @@ outrel:
                        if (user > 20000)
                                return -EINVAL;
                        user = (user * HZ) / 1000;
-                        server->dentry_ttl = user;
+                        atomic_set(&server->dentry_ttl, user);
                        return 0;
                }
-                
        case NCP_IOC_GETDENTRYTTL:
                {
-                        u_int32_t user = (server->dentry_ttl * 1000) / HZ;
+                        u_int32_t user = (atomic_read(&server->dentry_ttl) * 1000) / HZ;
                        if (copy_to_user(argp, &user, sizeof(user)))
                                return -EFAULT;
                        return 0;
@@ -811,59 +809,103 @@ outrel:
        return -EINVAL;
 }
-static int ncp_ioctl_need_write(unsigned int cmd)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct ncp_server *server = NCP_SERVER(inode);
+        uid_t uid = current_uid();
+        int need_drop_write = 0;
+        long ret;
        switch (cmd) {
-        case NCP_IOC_GET_FS_INFO:
-        case NCP_IOC_GET_FS_INFO_V2:
-        case NCP_IOC_NCPREQUEST:
-        case NCP_IOC_SETDENTRYTTL:
-        case NCP_IOC_SIGN_INIT:
-        case NCP_IOC_LOCKUNLOCK:
-        case NCP_IOC_SET_SIGN_WANTED:
-                return 1;
-        case NCP_IOC_GETOBJECTNAME:
-        case NCP_IOC_SETOBJECTNAME:
-        case NCP_IOC_GETPRIVATEDATA:
-        case NCP_IOC_SETPRIVATEDATA:
        case NCP_IOC_SETCHARSETS:
-        case NCP_IOC_GETCHARSETS:
        case NCP_IOC_CONN_LOGGED_IN:
-        case NCP_IOC_GETDENTRYTTL:
-        case NCP_IOC_GETMOUNTUID2:
-        case NCP_IOC_SIGN_WANTED:
-        case NCP_IOC_GETROOT:
        case NCP_IOC_SETROOT:
-                return 0;
+                if (!capable(CAP_SYS_ADMIN)) {
-        default:
+                        ret = -EACCES;
-                /* unknown IOCTL command, assume write */
+                        goto out;
-                return 1;
+                }
+                break;
        }
-}
+        if (server->m.mounted_uid != uid) {
+                switch (cmd) {
-long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-        long ret;
-        lock_kernel();
-        if (ncp_ioctl_need_write(cmd)) {
                /*
-                 * inside the ioctl(), any failures which
+                 * Only mount owner can issue these ioctls.  Information
-                 * are because of file_permission() are
+                 * necessary to authenticate to other NDS servers is
-                 * -EACCESS, so it seems consistent to keep
+                 * stored here.
-                 *  that here.
                 */
-                if (mnt_want_write(filp->f_path.mnt)) {
+                case NCP_IOC_GETOBJECTNAME:
+                case NCP_IOC_SETOBJECTNAME:
+                case NCP_IOC_GETPRIVATEDATA:
+                case NCP_IOC_SETPRIVATEDATA:
+#ifdef CONFIG_COMPAT
+                case NCP_IOC_GETOBJECTNAME_32:
+                case NCP_IOC_SETOBJECTNAME_32:
+                case NCP_IOC_GETPRIVATEDATA_32:
+                case NCP_IOC_SETPRIVATEDATA_32:
+#endif
                        ret = -EACCES;
                        goto out;
+                /*
+                 * These require write access on the inode if user id
+                 * does not match.  Note that they do not write to the
+                 * file...  But old code did mnt_want_write, so I keep
+                 * it as is.  Of course not for mountpoint owner, as
+                 * that breaks read-only mounts altogether as ncpmount
+                 * needs working NCP_IOC_NCPREQUEST and
+                 * NCP_IOC_GET_FS_INFO.  Some of these codes (setdentryttl,
+                 * signinit, setsignwanted) should probably be restricted
+                 * to owner only, or even more to CAP_SYS_ADMIN.
+                 */
+                case NCP_IOC_GET_FS_INFO:
+                case NCP_IOC_GET_FS_INFO_V2:
+                case NCP_IOC_NCPREQUEST:
+                case NCP_IOC_SETDENTRYTTL:
+                case NCP_IOC_SIGN_INIT:
+                case NCP_IOC_LOCKUNLOCK:
+                case NCP_IOC_SET_SIGN_WANTED:
+#ifdef CONFIG_COMPAT
+                case NCP_IOC_GET_FS_INFO_V2_32:
+                case NCP_IOC_NCPREQUEST_32:
+#endif
+                        ret = mnt_want_write_file(filp);
+                        if (ret)
+                                goto out;
+                        need_drop_write = 1;
+                        ret = inode_permission(inode, MAY_WRITE);
+                        if (ret)
+                                goto outDropWrite;
+                        break;
+                /*
+                 * Read access required.
+                 */
+                case NCP_IOC_GETMOUNTUID16:
+                case NCP_IOC_GETMOUNTUID32:
+                case NCP_IOC_GETMOUNTUID64:
+                case NCP_IOC_GETROOT:
+                case NCP_IOC_SIGN_WANTED:
+                        ret = inode_permission(inode, MAY_READ);
+                        if (ret)
+                                goto out;
+                        break;
+                /*
+                 * Anybody can read these.
+                 */
+                case NCP_IOC_GETCHARSETS:
+                case NCP_IOC_GETDENTRYTTL:
+                default:
+                /* Three codes below are protected by CAP_SYS_ADMIN above. */
+                case NCP_IOC_SETCHARSETS:
+                case NCP_IOC_CONN_LOGGED_IN:
+                case NCP_IOC_SETROOT:
+                        break;
                }
        }
-        ret = __ncp_ioctl(filp, cmd, arg);
+        ret = __ncp_ioctl(inode, cmd, arg);
-        if (ncp_ioctl_need_write(cmd))
+outDropWrite:
+        if (need_drop_write)
                mnt_drop_write(filp->f_path.mnt);
 out:
-        unlock_kernel();
        return ret;
 }
@@ -872,10 +914,8 @@ long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        long ret;
-        lock_kernel();
        arg = (unsigned long) compat_ptr(arg);
        ret = ncp_ioctl(file, cmd, arg);
-        unlock_kernel();
        return ret;
 }
 #endif
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 0ec6237a5970..a95615a0b6ac 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -107,17 +107,17 @@ ncp_reply_data(struct ncp_server *server, int offset)
        return &(server->packet[sizeof(struct ncp_reply_header) + offset]);
 }
-static inline u8 BVAL(void *data)
+static inline u8 BVAL(const void *data)
 {
-        return *(u8 *)data;
+        return *(const u8 *)data;
 }
 static u8 ncp_reply_byte(struct ncp_server *server, int offset)
 {
-        return *(u8 *)ncp_reply_data(server, offset);
+        return *(const u8 *)ncp_reply_data(server, offset);
 }
-static inline u16 WVAL_LH(void *data)
+static inline u16 WVAL_LH(const void *data)
 {
        return get_unaligned_le16(data);
 }
@@ -134,7 +134,7 @@ ncp_reply_be16(struct ncp_server *server, int offset)
        return get_unaligned_be16(ncp_reply_data(server, offset));
 }
-static inline u32 DVAL_LH(void *data)
+static inline u32 DVAL_LH(const void *data)
 {
        return get_unaligned_le32(data);
 }
@@ -349,9 +349,9 @@ int ncp_dirhandle_free(struct ncp_server* server, __u8 dirhandle) {
        return result;
 }
-void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
+void ncp_extract_file_info(const void *structure, struct nw_info_struct *target)
 {
-        __u8 *name_len;
+        const __u8 *name_len;
        const int info_struct_size = offsetof(struct nw_info_struct, nameLen);
        memcpy(target, structure, info_struct_size);
@@ -364,7 +364,7 @@ void ncp_extract_file_info(void *structure, struct nw_info_struct *target)
 }
 #ifdef CONFIG_NCPFS_NFS_NS
-static inline void ncp_extract_nfs_info(unsigned char *structure,
+static inline void ncp_extract_nfs_info(const unsigned char *structure,
                                 struct nw_nfs_info *target)
 {
        target->mode = DVAL_LH(structure);
@@ -417,7 +417,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
 * Returns information for a (one-component) name relative to
 * the specified directory.
 */
-int ncp_obtain_info(struct ncp_server *server, struct inode *dir, char *path,
+int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *path,
                        struct nw_info_struct *target)
 {
        __u8  volnum = NCP_FINFO(dir)->volNumber;
@@ -452,16 +452,16 @@ out:
 #ifdef CONFIG_NCPFS_NFS_NS
 static int
 ncp_obtain_DOS_dir_base(struct ncp_server *server,
-                __u8 volnum, __le32 dirent,
+                __u8 ns, __u8 volnum, __le32 dirent,
-                char *path, /* At most 1 component */
+                const char *path, /* At most 1 component */
                __le32 *DOS_dir_base)
 {
        int result;
        ncp_init_request(server);
        ncp_add_byte(server, 6); /* subfunction */
-        ncp_add_byte(server, server->name_space[volnum]);
+        ncp_add_byte(server, ns);
-        ncp_add_byte(server, server->name_space[volnum]);
+        ncp_add_byte(server, ns);
        ncp_add_word(server, cpu_to_le16(0x8006)); /* get all */
        ncp_add_dword(server, RIM_DIRECTORY);
        ncp_add_handle_path(server, volnum, dirent, 1, path);
@@ -523,10 +523,27 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
 #endif  /* defined(CONFIG_NCPFS_OS2_NS) || defined(CONFIG_NCPFS_NFS_NS) */
 }
+int
+ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
+{
+        int ns = ncp_get_known_namespace(server, volume);
+        if (ret_ns)
+                *ret_ns = ns;
+        DPRINTK("lookup_vol: namespace[%d] = %d\n",
+                volume, server->name_space[volume]);
+        if (server->name_space[volume] == ns)
+                return 0;
+        server->name_space[volume] = ns;
+        return 1;
+}
 static int
 ncp_ObtainSpecificDirBase(struct ncp_server *server,
                __u8 nsSrc, __u8 nsDst, __u8 vol_num, __le32 dir_base,
-                char *path, /* At most 1 component */
+                const char *path, /* At most 1 component */
                __le32 *dirEntNum, __le32 *DosDirNum)
 {
        int result;
@@ -560,14 +577,13 @@ ncp_mount_subdir(struct ncp_server *server,
 {
        int dstNS;
        int result;
-        
-        dstNS = ncp_get_known_namespace(server, volNumber);
+        ncp_update_known_namespace(server, volNumber, &dstNS);
        if ((result = ncp_ObtainSpecificDirBase(server, srcNS, dstNS, volNumber, 
                                      dirEntNum, NULL, newDirEnt, newDosEnt)) != 0)
        {
                return result;
        }
-        server->name_space[volNumber] = dstNS;
        *volume = volNumber;
        server->m.mounted_vol[1] = 0;
        server->m.mounted_vol[0] = 'X';
@@ -575,11 +591,10 @@ ncp_mount_subdir(struct ncp_server *server,
 }
 int 
-ncp_get_volume_root(struct ncp_server *server, const char *volname,
+ncp_get_volume_root(struct ncp_server *server,
-                    __u32* volume, __le32* dirent, __le32* dosdirent)
+                    const char *volname, __u32* volume, __le32* dirent, __le32* dosdirent)
 {
        int result;
-        __u8 volnum;
        DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
@@ -601,21 +616,14 @@ ncp_get_volume_root(struct ncp_server *server, const char *volname,
                return result;
        }
        *dirent = *dosdirent = ncp_reply_dword(server, 4);
-        volnum = ncp_reply_byte(server, 8);
+        *volume = ncp_reply_byte(server, 8);
        ncp_unlock_server(server);
-        *volume = volnum;
-        server->name_space[volnum] = ncp_get_known_namespace(server, volnum);
-        DPRINTK("lookup_vol: namespace[%d] = %d\n",
-                volnum, server->name_space[volnum]);
        return 0;
 }
 int
-ncp_lookup_volume(struct ncp_server *server, const char *volname,
+ncp_lookup_volume(struct ncp_server *server,
-                  struct nw_info_struct *target)
+                  const char *volname, struct nw_info_struct *target)
 {
        int result;
@@ -625,6 +633,7 @@ ncp_lookup_volume(struct ncp_server *server, const char *volname,
        if (result) {
                return result;
        }
+        ncp_update_known_namespace(server, target->volNumber, NULL);
        target->nameLen = strlen(volname);
        memcpy(target->entryName, volname, target->nameLen+1);
        target->attributes = aDIR;
@@ -676,8 +685,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 {
        int result = 0;
+        ncp_init_request(server);
        if (server->name_space[volnum] == NW_NS_NFS) {
-                ncp_init_request(server);
                ncp_add_byte(server, 25);       /* subfunction */
                ncp_add_byte(server, server->name_space[volnum]);
                ncp_add_byte(server, NW_NS_NFS);
@@ -690,8 +699,8 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
                ncp_add_dword_lh(server, 1);    /* nlinks */
                ncp_add_dword_lh(server, rdev);
                result = ncp_request(server, 87);
-                ncp_unlock_server(server);
        }
+        ncp_unlock_server(server);
        return result;
 }
 #endif
@@ -700,7 +709,7 @@ int ncp_modify_nfs_info(struct ncp_server *server, __u8 volnum, __le32 dirent,
 static int
 ncp_DeleteNSEntry(struct ncp_server *server,
                  __u8 have_dir_base, __u8 volnum, __le32 dirent,
-                  char* name, __u8 ns, __le16 attr)
+                  const char* name, __u8 ns, __le16 attr)
 {
        int result;
@@ -734,23 +743,25 @@ ncp_del_file_or_subdir2(struct ncp_server *server,
 int
 ncp_del_file_or_subdir(struct ncp_server *server,
-                       struct inode *dir, char *name)
+                       struct inode *dir, const char *name)
 {
        __u8  volnum = NCP_FINFO(dir)->volNumber;
        __le32 dirent = NCP_FINFO(dir)->dirEntNum;
+        int name_space;
+        name_space = server->name_space[volnum];
 #ifdef CONFIG_NCPFS_NFS_NS
-        if (server->name_space[volnum]==NW_NS_NFS)
+        if (name_space == NW_NS_NFS)
        {
                int result;
 
-                result=ncp_obtain_DOS_dir_base(server, volnum, dirent, name, &dirent);
+                result=ncp_obtain_DOS_dir_base(server, name_space, volnum, dirent, name, &dirent);
                if (result) return result;
-                return ncp_DeleteNSEntry(server, 1, volnum, dirent, NULL, NW_NS_DOS, cpu_to_le16(0x8006));
+                name = NULL;
+                name_space = NW_NS_DOS;
        }
-        else
 #endif  /* CONFIG_NCPFS_NFS_NS */
-                return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, server->name_space[volnum], cpu_to_le16(0x8006));
+        return ncp_DeleteNSEntry(server, 1, volnum, dirent, name, name_space, cpu_to_le16(0x8006));
 }
 static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
@@ -765,7 +776,7 @@ static inline void ConvertToNWfromDWORD(__u16 v0, __u16 v1, __u8 ret[6])
 /* If both dir and name are NULL, then in target there's already a
   looked-up entry that wants to be opened. */
 int ncp_open_create_file_or_subdir(struct ncp_server *server,
-                                   struct inode *dir, char *name,
+                                   struct inode *dir, const char *name,
                                   int open_create_mode,
                                   __le32 create_attributes,
                                   __le16 desired_acc_rights,
@@ -890,8 +901,8 @@ int ncp_search_for_fileset(struct ncp_server *server,
 static int
 ncp_RenameNSEntry(struct ncp_server *server,
-                  struct inode *old_dir, char *old_name, __le16 old_type,
+                  struct inode *old_dir, const char *old_name, __le16 old_type,
-                  struct inode *new_dir, char *new_name)
+                  struct inode *new_dir, const char *new_name)
 {
        int result = -EINVAL;
@@ -929,8 +940,8 @@ out:
 }
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-                                struct inode *old_dir, char *old_name,
+                                struct inode *old_dir, const char *old_name,
-                                struct inode *new_dir, char *new_name)
+                                struct inode *new_dir, const char *new_name)
 {
        int result;
        __le16 old_type = cpu_to_le16(0x06);
@@ -958,7 +969,7 @@ int
 ncp_read_kernel(struct ncp_server *server, const char *file_id,
             __u32 offset, __u16 to_read, char *target, int *bytes_read)
 {
-        char *source;
+        const char *source;
        int result;
        ncp_init_request(server);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 2441d1ab57dc..3c57eca634ce 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -65,10 +65,11 @@ static inline void ncp_inode_close(struct inode *inode) {
        atomic_dec(&NCP_FINFO(inode)->opened);
 }
-void ncp_extract_file_info(void* src, struct nw_info_struct* target);
+void ncp_extract_file_info(const void* src, struct nw_info_struct* target);
-int ncp_obtain_info(struct ncp_server *server, struct inode *, char *,
+int ncp_obtain_info(struct ncp_server *server, struct inode *, const char *,
                struct nw_info_struct *target);
 int ncp_obtain_nfs_info(struct ncp_server *server, struct nw_info_struct *target);
+int ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns);
 int ncp_get_volume_root(struct ncp_server *server, const char *volname,
                        __u32 *volume, __le32 *dirent, __le32 *dosdirent);
 int ncp_lookup_volume(struct ncp_server *, const char *, struct nw_info_struct *);
@@ -80,8 +81,8 @@ int ncp_modify_nfs_info(struct ncp_server *, __u8 volnum, __le32 dirent,
                        __u32 mode, __u32 rdev);
 int ncp_del_file_or_subdir2(struct ncp_server *, struct dentry*);
-int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, char *);
+int ncp_del_file_or_subdir(struct ncp_server *, struct inode *, const char *);
-int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *,
+int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, const char *,
                                int, __le32, __le16, struct ncp_entry_info *);
 int ncp_initialize_search(struct ncp_server *, struct inode *,
@@ -93,7 +94,7 @@ int ncp_search_for_fileset(struct ncp_server *server,
                           char** rbuf, size_t* rsize);
 int ncp_ren_or_mov_file_or_subdir(struct ncp_server *server,
-                              struct inode *, char *, struct inode *, char *);
+                              struct inode *, const char *, struct inode *, const char *);
 int
@@ -170,13 +171,13 @@ static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
 #endif /* CONFIG_NCPFS_NLS */
 #define NCP_GET_AGE(dentry)     (jiffies - (dentry)->d_time)
-#define NCP_MAX_AGE(server)     ((server)->dentry_ttl)
+#define NCP_MAX_AGE(server)     atomic_read(&(server)->dentry_ttl)
 #define NCP_TEST_AGE(server,dentry)     (NCP_GET_AGE(dentry) < NCP_MAX_AGE(server))
 static inline void
 ncp_age_dentry(struct ncp_server* server, struct dentry* dentry)
 {
-        dentry->d_time = jiffies - server->dentry_ttl;
+        dentry->d_time = jiffies - NCP_MAX_AGE(server);
 }
 static inline void
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index 7c0b5c21e6cf..d8b2d7e6910b 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -15,21 +15,21 @@
 /* i386: 32-bit, little endian, handles mis-alignment */
 #ifdef __i386__
-#define GET_LE32(p) (*(int *)(p))
+#define GET_LE32(p) (*(const int *)(p))
 #define PUT_LE32(p,v) { *(int *)(p)=v; }
 #else
 /* from include/ncplib.h */
-#define BVAL(buf,pos) (((__u8 *)(buf))[pos])
+#define BVAL(buf,pos) (((const __u8 *)(buf))[pos])
 #define PVAL(buf,pos) ((unsigned)BVAL(buf,pos))
-#define BSET(buf,pos,val) (BVAL(buf,pos) = (val))
+#define BSET(buf,pos,val) (((__u8 *)(buf))[pos] = (val))
 static inline __u16
-WVAL_LH(__u8 * buf, int pos)
+WVAL_LH(const __u8 * buf, int pos)
 {
        return PVAL(buf, pos) | PVAL(buf, pos + 1) << 8;
 }
 static inline __u32
-DVAL_LH(__u8 * buf, int pos)
+DVAL_LH(const __u8 * buf, int pos)
 {
        return WVAL_LH(buf, pos) | WVAL_LH(buf, pos + 2) << 16;
 }
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index c7ff6c700a6e..668bd267346e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -746,7 +746,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
                return -EIO;
        }
        if (!ncp_conn_valid(server)) {
-                printk(KERN_ERR "ncpfs: Connection invalid!\n");
                return -EIO;
        }
        {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..ba306658a6db 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -76,13 +76,17 @@ config NFS_V4
 config NFS_V4_1
        bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-        depends on NFS_V4 && EXPERIMENTAL
+        depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+        select PNFS_FILE_LAYOUT
        help
          This option enables support for minor version 1 of the NFSv4 protocol
-          (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+          (RFC 5661) in the kernel's NFS client.
          If unsure, say N.
+config PNFS_FILE_LAYOUT
+        tristate
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
@@ -117,3 +121,14 @@ config NFS_USE_KERNEL_DNS
        select DNS_RESOLVER
        select KEYS
        default y
+config NFS_USE_NEW_IDMAPPER
+        bool "Use the new idmapper upcall routine"
+        depends on NFS_V4 && KEYS
+        help
+          Say Y here if you want NFS to use the new idmapper upcall functions.
+          You will need /sbin/request-key (usually provided by the keyutils
+          package).  For details, read
+          <file:Documentation/filesystems/nfs/idmapper.txt>.
+          If you are unsure, say N.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index da7fda639eac..4776ff9e3814 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1)  += pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e17b49e2eabd..aeec017fe814 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -109,7 +109,7 @@ nfs4_callback_up(struct svc_serv *serv)
 {
        int ret;
-        ret = svc_create_xprt(serv, "tcp", PF_INET,
+        ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
        if (ret <= 0)
                goto out_err;
@@ -117,7 +117,7 @@ nfs4_callback_up(struct svc_serv *serv)
        dprintk("NFS: Callback listener port = %u (af %u)\n",
                        nfs_callback_tcpport, PF_INET);
-        ret = svc_create_xprt(serv, "tcp", PF_INET6,
+        ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
        if (ret > 0) {
                nfs_callback_tcpport6 = ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 930d10fecdaf..2950fca0c61b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
        if (delegation == NULL)
                return 0;
-        /* seqid is 4-bytes long */
+        if (stateid->stateid.seqid != 0)
-        if (((u32 *) &stateid->data)[0] != 0)
                return 0;
-        if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
+        if (memcmp(&delegation->stateid.stateid.other,
-                   sizeof(stateid->data)-4))
+                   &stateid->stateid.other,
+                   NFS4_STATEID_OTHER_SIZE))
                return 0;
        return 1;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e7340729af89..0870d0d4efc0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_CLIENT
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        cred = rpc_lookup_machine_cred();
        if (!IS_ERR(cred))
                clp->cl_machine_cred = cred;
+#if defined(CONFIG_NFS_V4_1)
+        INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
        nfs_fscache_get_client_cookie(clp);
        return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
                nfs_free_client(clp);
        }
 }
+EXPORT_SYMBOL_GPL(nfs_put_client);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
@@ -601,6 +605,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 {
        struct rpc_clnt         *clnt = NULL;
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = clp->cl_proto,
                .address        = (struct sockaddr *)&clp->cl_addr,
                .addrsize       = clp->cl_addrlen,
@@ -635,7 +640,8 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-        if (!(server->flags & NFS_MOUNT_NONLM))
+        if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) ||
+                        !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
                nlmclnt_done(server->nlm_host);
 }
@@ -657,7 +663,8 @@ static int nfs_start_lockd(struct nfs_server *server)
        if (nlm_init.nfs_version > 3)
                return 0;
-        if (server->flags & NFS_MOUNT_NONLM)
+        if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+                        (server->flags & NFS_MOUNT_LOCAL_FCNTL))
                return 0;
        switch (clp->cl_proto) {
@@ -898,11 +905,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
                server->wsize = NFS_MAX_FILE_IO_SIZE;
        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        set_pnfs_layoutdriver(server, fsinfo->layouttype);
        server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
        server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-        if (server->dtsize > PAGE_CACHE_SIZE)
+        if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
-                server->dtsize = PAGE_CACHE_SIZE;
+                server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
        if (server->dtsize > server->rsize)
                server->dtsize = server->rsize;
@@ -913,6 +922,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        server->maxfilesize = fsinfo->maxfilesize;
+        server->time_delta = fsinfo->time_delta;
        /* We're airborne Set socket buffersize */
        rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 }
@@ -935,6 +946,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
        }
        fsinfo.fattr = fattr;
+        fsinfo.layouttype = 0;
        error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
        if (error < 0)
                goto out_error;
@@ -1017,6 +1029,7 @@ void nfs_free_server(struct nfs_server *server)
 {
        dprintk("--> nfs_free_server()\n");
+        unset_pnfs_layoutdriver(server);
        spin_lock(&nfs_client_lock);
        list_del(&server->client_link);
        list_del(&server->master_link);
@@ -1356,8 +1369,9 @@ static int nfs4_init_server(struct nfs_server *server,
        /* Initialise the client representation from the mount data */
        server->flags = data->flags;
-        server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+        server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
-                NFS_CAP_POSIX_LOCK;
+        if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+                        server->caps |= NFS_CAP_READDIRPLUS;
        server->options = data->options;
        /* Get a client record */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9c3c43cea1d..232a7eead33a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -71,20 +71,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
        if (inode->i_flock == NULL)
                goto out;
-        /* Protect inode->i_flock using the BKL */
+        /* Protect inode->i_flock using the file locks lock */
-        lock_kernel();
+        lock_flocks();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file) != ctx)
                        continue;
-                unlock_kernel();
+                unlock_flocks();
                status = nfs4_lock_delegation_recall(state, fl);
                if (status < 0)
                        goto out;
-                lock_kernel();
+                lock_flocks();
        }
-        unlock_kernel();
+        unlock_flocks();
 out:
        return status;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e257172d438c..07ac3847e562 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,11 +33,12 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
-#include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 /* #define NFS_DEBUG_VERBOSE 1 */
@@ -55,6 +56,7 @@ static int nfs_rename(struct inode *, struct dentry *,
                      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static int nfs_readdir_clear_array(struct page*, gfp_t);
 const struct file_operations nfs_dir_operations = {
        .llseek         = nfs_llseek_dir,
@@ -80,6 +82,10 @@ const struct inode_operations nfs_dir_inode_operations = {
        .setattr        = nfs_setattr,
 };
+const struct address_space_operations nfs_dir_addr_space_ops = {
+        .releasepage = nfs_readdir_clear_array,
+};
 #ifdef CONFIG_NFS_V3
 const struct inode_operations nfs3_dir_inode_operations = {
        .create         = nfs_create,
@@ -104,8 +110,9 @@ const struct inode_operations nfs3_dir_inode_operations = {
 #ifdef CONFIG_NFS_V4
 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd);
 const struct inode_operations nfs4_dir_inode_operations = {
-        .create         = nfs_create,
+        .create         = nfs_open_create,
        .lookup         = nfs_atomic_lookup,
        .link           = nfs_link,
        .unlink         = nfs_unlink,
@@ -150,51 +157,197 @@ nfs_opendir(struct inode *inode, struct file *filp)
        return res;
 }
-typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
+struct nfs_cache_array_entry {
+        u64 cookie;     /* copied from nfs_entry->prev_cookie; the value the cookie search compares against */
+        u64 ino;        /* copied from nfs_entry->ino; handed to filldir */
+        struct qstr string;     /* entry name; .name is a kmemdup()ed copy that nfs_readdir_clear_array() kfree()s */
+};
+struct nfs_cache_array {
+        unsigned int size;      /* number of entries currently stored in array[] */
+        int eof_index;  /* index of the entry that was flagged eof; -1 while no eof has been recorded */
+        u64 last_cookie;        /* nfs_entry->cookie of the most recently added entry */
+        struct nfs_cache_array_entry array[0];  /* entries live in the same page, right after this header */
+};
+#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) /* entries that fit in one page after the header */
+typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 typedef struct {
        struct file     *file;
        struct page     *page;
        unsigned long   page_index;
-        __be32          *ptr;
        u64             *dir_cookie;
        loff_t          current_index;
-        struct nfs_entry *entry;
        decode_dirent_t decode;
-        int             plus;
        unsigned long   timestamp;
        unsigned long   gencount;
-        int             timestamp_valid;
+        unsigned int    cache_entry_index;
+        unsigned int    plus:1;
+        unsigned int    eof:1;
 } nfs_readdir_descriptor_t;
-/* Now we cache directories properly, by stuffing the dirent
+/*
- * data directly in the page cache.
+ * The caller is responsible for calling nfs_readdir_release_array(page)
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- *       page-in of the RPC reply, nowhere else, this simplies
- *       things substantially.
 */
 static
-int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+        if (page == NULL)
+                return ERR_PTR(-EIO);
+        return (struct nfs_cache_array *)kmap(page);
+}
+static
+void nfs_readdir_release_array(struct page *page)
+{
+        kunmap(page);   /* drops the kmap() taken in nfs_readdir_get_array() */
+}
+/*
+ * ->releasepage callback for the directory mapping: free the name strings
+ * that nfs_readdir_add_to_array() allocated for every entry cached in @page.
+ *
+ * @page: page holding a struct nfs_cache_array
+ * @mask: allocation mask supplied by the caller; not used here
+ *
+ * Always returns 0.
+ */
+static
+int nfs_readdir_clear_array(struct page *page, gfp_t mask)
+{
+        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        int i;
+        /* On failure nothing was mapped, so there is nothing to free or unmap */
+        if (IS_ERR(array))
+                return 0;
+        for (i = 0; i < array->size; i++)
+                kfree(array->array[i].string.name);
+        nfs_readdir_release_array(page);
+        return 0;
+}
+/*
+ * Fill @string with a private copy of the @len bytes at @name together with
+ * the matching full_name_hash().
+ *
+ * Returns 0 on success or -ENOMEM when the copy cannot be allocated.
+ *
+ * string->name belongs to the caller, who must kfree() it.  For entries
+ * stored through nfs_readdir_add_to_array() that happens in
+ * nfs_readdir_clear_array().
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+        string->len = len;
+        string->name = kmemdup(name, len, GFP_KERNEL);
+        if (string->name != NULL) {
+                string->hash = full_name_hash(name, len);
+                return 0;
+        }
+        return -ENOMEM;
+}
+/*
+ * Append @entry to the cache array stored in @page.
+ *
+ * Returns 0 on success, -EIO when the array already holds
+ * MAX_READDIR_ARRAY entries, the error from nfs_readdir_get_array() when the
+ * page cannot be mapped, or the error from nfs_readdir_make_qstr().
+ */
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+        struct nfs_cache_array *array = nfs_readdir_get_array(page);
+        struct nfs_cache_array_entry *slot;
+        int err = -EIO;
+        if (IS_ERR(array))
+                return PTR_ERR(array);
+        if (array->size < MAX_READDIR_ARRAY) {
+                slot = &array->array[array->size];
+                slot->cookie = entry->prev_cookie;
+                slot->ino = entry->ino;
+                err = nfs_readdir_make_qstr(&slot->string, entry->name, entry->len);
+                if (err == 0) {
+                        /* Only a fully built slot becomes visible through ->size */
+                        array->last_cookie = entry->cookie;
+                        if (entry->eof == 1)
+                                array->eof_index = array->size;
+                        array->size++;
+                }
+        }
+        nfs_readdir_release_array(page);
+        return err;
+}
+/*
+ * Locate the entry for desc->file->f_pos in @array, where @array starts at
+ * directory position desc->current_index.
+ *
+ * Returns:
+ *   0            - entry found; *desc->dir_cookie and desc->cache_entry_index
+ *                  are set, and desc->eof is set if it is the eof entry.
+ *   -EAGAIN      - f_pos lies beyond this array and no eof is recorded in it;
+ *                  desc->current_index has been advanced by array->size.
+ *   -EBADCOOKIE  - f_pos lies before this array, or beyond an array that
+ *                  already contains the eof entry; desc->eof is set.
+ */
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+        loff_t diff = desc->file->f_pos - desc->current_index;
+        unsigned int index;
+        if (diff < 0)
+                goto out_eof;
+        if (diff >= array->size) {
+                /* eof_index is -1 while unknown, so 0 is a valid "eof seen" index */
+                if (array->eof_index >= 0)
+                        goto out_eof;
+                desc->current_index += array->size;
+                return -EAGAIN;
+        }
+        index = (unsigned int)diff;
+        *desc->dir_cookie = array->array[index].cookie;
+        desc->cache_entry_index = index;
+        if (index == array->eof_index)
+                desc->eof = 1;
+        return 0;
+out_eof:
+        desc->eof = 1;
+        return -EBADCOOKIE;
+}
+/*
+ * Scan @array for the entry whose cookie equals *desc->dir_cookie.
+ *
+ * Returns 0 and sets desc->cache_entry_index on a match.  Without a match it
+ * returns -EBADCOOKIE if the scan passed the eof entry, else -EAGAIN.
+ * desc->eof is set whenever the eof entry is visited, match or not.
+ */
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+        int idx = 0;
+        int result = -EAGAIN;
+        while (idx < array->size) {
+                if (idx == array->eof_index) {
+                        desc->eof = 1;
+                        result = -EBADCOOKIE;
+                }
+                if (array->array[idx].cookie == *desc->dir_cookie) {
+                        desc->cache_entry_index = idx;
+                        return 0;
+                }
+                idx++;
+        }
+        return result;
+}
+/*
+ * Search the cache array in desc->page for the current target: by position
+ * when *desc->dir_cookie is 0, by cookie otherwise.
+ *
+ * Returns the result of the chosen search, -EBADCOOKIE when desc->dir_cookie
+ * is NULL, or the error from nfs_readdir_get_array().
+ */
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+        struct nfs_cache_array *array;
+        int status;
+        if (desc->dir_cookie == NULL)
+                return -EBADCOOKIE;
+        array = nfs_readdir_get_array(desc->page);
+        if (IS_ERR(array))
+                return PTR_ERR(array);
+        if (*desc->dir_cookie != 0)
+                status = nfs_readdir_search_for_cookie(array, desc);
+        else
+                status = nfs_readdir_search_for_pos(array, desc);
+        nfs_readdir_release_array(desc->page);
+        return status;
+}
+/* Fill a page with xdr information before transferring to the cache page */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+                        struct nfs_entry *entry, struct file *file, struct inode *inode)
 {
-        struct file     *file = desc->file;
-        struct inode    *inode = file->f_path.dentry->d_inode;
        struct rpc_cred *cred = nfs_file_cred(file);
        unsigned long   timestamp, gencount;
        int             error;
-        dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
-                        __func__, (long long)desc->entry->cookie,
-                        page->index);
 again:
        timestamp = jiffies;
        gencount = nfs_inc_attr_generation_counter();
-        error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
+        error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
                                          NFS_SERVER(inode)->dtsize, desc->plus);
        if (error < 0) {
                /* We requested READDIRPLUS, but the server doesn't grok it */
@@ -208,190 +361,292 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
        }
        desc->timestamp = timestamp;
        desc->gencount = gencount;
-        desc->timestamp_valid = 1;
+error:
-        SetPageUptodate(page);
+        return error;
-        /* Ensure consistent page alignment of the data.
-         * Note: assumes we have exclusive access to this mapping either
-         *       through inode->i_mutex or some other mechanism.
-         */
-        if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-                /* Should never happen */
-                nfs_zap_mapping(inode, inode->i_mapping);
-        }
-        unlock_page(page);
-        return 0;
- error:
-        unlock_page(page);
-        return -EIO;
 }
-static inline
+/* Fill in an entry based on the xdr code stored in desc->page */
-int dir_decode(nfs_readdir_descriptor_t *desc)
+static
+int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream)
 {
-        __be32  *p = desc->ptr;
+        __be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus);
-        p = desc->decode(p, desc->entry, desc->plus);
        if (IS_ERR(p))
                return PTR_ERR(p);
-        desc->ptr = p;
-        if (desc->timestamp_valid) {
+        entry->fattr->time_start = desc->timestamp;
-                desc->entry->fattr->time_start = desc->timestamp;
+        entry->fattr->gencount = desc->gencount;
-                desc->entry->fattr->gencount = desc->gencount;
-        } else
-                desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
        return 0;
 }
-static inline
+/*
+ * Return 1 when @dentry has an inode whose NFS file handle is identical to
+ * the handle decoded into @entry, 0 otherwise (including when the dentry has
+ * no inode).
+ */
+static
-void dir_page_release(nfs_readdir_descriptor_t *desc)
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 {
-        kunmap(desc->page);
+        struct nfs_inode *node;
-        page_cache_release(desc->page);
+        if (dentry->d_inode == NULL)
-        desc->page = NULL;
+                goto different;
-        desc->ptr = NULL;
+        node = NFS_I(dentry->d_inode);
+        if (node->fh.size != entry->fh->size)
+                goto different;
+        /*
+         * File handles are opaque bytes, so compare them with memcmp().  A
+         * str*() comparison stops at the first zero byte and would report
+         * handles that differ after it as equal.
+         */
+        if (memcmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
+                goto different;
+        return 1;
+different:
+        return 0;
 }
-/*
+static
- * Given a pointer to a buffer that has already been filled by a call
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
- * to readdir, find the next entry with cookie '*desc->dir_cookie'.
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
- */
-static inline
-int find_dirent(nfs_readdir_descriptor_t *desc)
 {
-        struct nfs_entry *entry = desc->entry;
+        struct qstr filename = {
-        int             loop_count = 0,
+                .len = entry->len,
-                        status;
+                .name = entry->name,
+        };
+        struct dentry *dentry;
+        struct dentry *alias;
+        struct inode *dir = parent->d_inode;
+        struct inode *inode;
-        while((status = dir_decode(desc)) == 0) {
+        if (filename.name[0] == '.') {
-                dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
+                if (filename.len == 1)
-                                __func__, (unsigned long long)entry->cookie);
+                        return;
-                if (entry->prev_cookie == *desc->dir_cookie)
+                if (filename.len == 2 && filename.name[1] == '.')
-                        break;
+                        return;
-                if (loop_count++ > 200) {
+        }
-                        loop_count = 0;
+        filename.hash = full_name_hash(filename.name, filename.len);
-                        schedule();
+        dentry = d_lookup(parent, &filename);
+        if (dentry != NULL) {
+                if (nfs_same_file(dentry, entry)) {
+                        nfs_refresh_inode(dentry->d_inode, entry->fattr);
+                        goto out;
+                } else {
+                        d_drop(dentry);
+                        dput(dentry);
                }
        }
-        return status;
+        dentry = d_alloc(parent, &filename);
+        if (dentry == NULL)
+                return;
+        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+        if (IS_ERR(inode))
+                goto out;
+        alias = d_materialise_unique(dentry, inode);
+        if (IS_ERR(alias))
+                goto out;
+        else if (alias) {
+                nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+                dput(alias);
+        } else
+                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+out:
+        dput(dentry);
+}
+/*
+ * Perform conversion from xdr to cache array.
+ *
+ * Decodes the reply held in @xdr_page (@buflen bytes) one entry at a time
+ * into @entry and appends each to the cache array stored in @page.  When
+ * desc->plus is set, each stored entry is also passed to nfs_prime_dcache().
+ *
+ * Decoding stops at the first decode error, at the first entry that cannot
+ * be stored (array full or out of memory), or after the entry flagged eof.
+ * If decoding ends with -EBADCOOKIE while @entry is flagged eof, the last
+ * stored entry is recorded as the eof entry.
+ */
+static
+void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+                                void *xdr_page, struct page *page, unsigned int buflen)
+{
+        struct xdr_stream stream;
+        struct xdr_buf buf;
+        __be32 *ptr = xdr_page;
+        int status;
+        struct nfs_cache_array *array;
+        buf.head->iov_base = xdr_page;
+        buf.head->iov_len = buflen;
+        buf.tail->iov_len = 0;
+        buf.page_base = 0;
+        buf.page_len = 0;
+        buf.buflen = buf.head->iov_len;
+        buf.len = buf.head->iov_len;
+        xdr_init_decode(&stream, &buf, ptr);
+        do {
+                status = xdr_decode(desc, entry, &stream);
+                if (status != 0)
+                        break;
+                /*
+                 * nfs_readdir_add_to_array() reports failure as a negative
+                 * errno (-EIO, -ENOMEM, ...).  Stop on any of them so that an
+                 * entry that could not be stored is not silently skipped.
+                 */
+                if (nfs_readdir_add_to_array(entry, page) != 0)
+                        break;
+                if (desc->plus == 1)
+                        nfs_prime_dcache(desc->file->f_path.dentry, entry);
+        } while (!entry->eof);
+        if (status == -EBADCOOKIE && entry->eof) {
+                array = nfs_readdir_get_array(page);
+                if (!IS_ERR(array)) {
+                        array->eof_index = array->size - 1;
+                        nfs_readdir_release_array(page);
+                }
+        }
+}
+/* Drop one reference on each of the first @npages pages in @pages. */
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+        struct page **cur = pages;
+        struct page **end = pages + npages;
+        for (; cur != end; cur++)
+                put_page(*cur);
+}
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+                unsigned int npages)
+{
+        vm_unmap_ram(ptr, npages);      /* undo the vm_map_ram() done in nfs_readdir_large_page() */
+        nfs_readdir_free_pagearray(pages, npages);      /* then put_page() each backing page */
 }
 /*
- * Given a pointer to a buffer that has already been filled by a call
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to readdir, find the entry at offset 'desc->file->f_pos'.
+ * to nfs_readdir_free_large_page
- *
- * If the end of the buffer has been reached, return -EAGAIN, if not,
- * return the offset within the buffer of the next entry to be
- * read.
 */
-static inline
+static
-int find_dirent_index(nfs_readdir_descriptor_t *desc)
+void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
 {
-        struct nfs_entry *entry = desc->entry;
+        void *ptr;
-        int             loop_count = 0,
+        unsigned int i;
-                        status;
+        for (i = 0; i < npages; i++) {
+                struct page *page = alloc_page(GFP_KERNEL);
+                if (page == NULL)
+                        goto out_freepages;
+                pages[i] = page;
+        }
-        for(;;) {
+        ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
-                status = dir_decode(desc);
+        if (!IS_ERR_OR_NULL(ptr))
-                if (status)
+                return ptr;
-                        break;
+out_freepages:
+        nfs_readdir_free_pagearray(pages, i);
+        return NULL;
+}
+/*
+ * Build the cache array in @page from one or more ->readdir() replies,
+ * starting at *desc->dir_cookie.
+ *
+ * Replies are fetched into a scratch buffer of NFS_MAX_READDIR_PAGES pages
+ * and converted by nfs_readdir_page_filler() until the eof entry has been
+ * stored, the array is full, or a request fails.
+ *
+ * Returns a negative errno on failure (-ENOMEM when the file handle, the
+ * attributes or the scratch buffer cannot be allocated), otherwise the value
+ * of the last nfs_readdir_xdr_filler() call.
+ */
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+        struct page *pages[NFS_MAX_READDIR_PAGES];
+        void *pages_ptr = NULL;
+        struct nfs_entry entry;
+        struct file     *file = desc->file;
+        struct nfs_cache_array *array;
+        /* The early exits below are allocation failures unless they set status */
+        int status = -ENOMEM;
+        unsigned int array_size = ARRAY_SIZE(pages);
+        entry.prev_cookie = 0;
+        entry.cookie = *desc->dir_cookie;
+        entry.eof = 0;
+        entry.fh = nfs_alloc_fhandle();
+        entry.fattr = nfs_alloc_fattr();
+        if (entry.fh == NULL || entry.fattr == NULL)
+                goto out;
-                dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n",
+        array = nfs_readdir_get_array(page);
-                                (unsigned long long)entry->cookie, desc->current_index);
+        if (IS_ERR(array)) {
+                /* Nothing was mapped, so skip the release step */
+                status = PTR_ERR(array);
+                goto out;
+        }
+        memset(array, 0, sizeof(struct nfs_cache_array));
+        array->eof_index = -1;
-                if (desc->file->f_pos == desc->current_index) {
+        pages_ptr = nfs_readdir_large_page(pages, array_size);
-                        *desc->dir_cookie = entry->cookie;
+        if (!pages_ptr)
+                goto out_release_array;
+        do {
+                status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+                if (status < 0)
                        break;
-                }
+                nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE);
-                desc->current_index++;
+        } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY);
-                if (loop_count++ > 200) {
-                        loop_count = 0;
+        nfs_readdir_free_large_page(pages_ptr, pages, array_size);
-                        schedule();
+out_release_array:
-                }
+        nfs_readdir_release_array(page);
-        }
+out:
+        nfs_free_fattr(entry.fattr);
+        nfs_free_fhandle(entry.fh);
        return status;
 }
 /*
- * Find the given page, and call find_dirent() or find_dirent_index in
+ * Now we cache directories properly, by converting xdr information
- * order to try to return the next entry.
+ * to an array that can be used for lookups later.  This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
 */
-static inline
+static
-int find_dirent_page(nfs_readdir_descriptor_t *desc)
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
        struct inode    *inode = desc->file->f_path.dentry->d_inode;
-        struct page     *page;
-        int             status;
-        dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
+        if (nfs_readdir_xdr_to_array(desc, page, inode) < 0)
-                        __func__, desc->page_index,
+                goto error;
-                        (long long) *desc->dir_cookie);
+        SetPageUptodate(page);
-        /* If we find the page in the page_cache, we cannot be sure
+        if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-         * how fresh the data is, so we will ignore readdir_plus attributes.
+                /* Should never happen */
-         */
+                nfs_zap_mapping(inode, inode->i_mapping);
-        desc->timestamp_valid = 0;
-        page = read_cache_page(inode->i_mapping, desc->page_index,
-                               (filler_t *)nfs_readdir_filler, desc);
-        if (IS_ERR(page)) {
-                status = PTR_ERR(page);
-                goto out;
        }
+        unlock_page(page);
+        return 0;
+ error:
+        unlock_page(page);
+        return -EIO;
+}
-        /* NOTE: Someone else may have changed the READDIRPLUS flag */
+static
-        desc->page = page;
+void cache_page_release(nfs_readdir_descriptor_t *desc)
-        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
+{
-        if (*desc->dir_cookie != 0)
+        page_cache_release(desc->page);
-                status = find_dirent(desc);
+        desc->page = NULL;
-        else
+}
-                status = find_dirent_index(desc);
-        if (status < 0)
+static
-                dir_page_release(desc);
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
- out:
+{
-        dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
+        struct page *page;
-        return status;
+        page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+                        desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+        if (IS_ERR(page))
+                desc->eof = 1;
+        return page;
 }
 /*
- * Recurse through the page cache pages, and return a
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
- * filled nfs_entry structure of the next directory entry if possible.
- *
- * The target for the search is '*desc->dir_cookie' if non-0,
- * 'desc->file->f_pos' otherwise
 */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+        int res;
+        desc->page = get_cache_page(desc);
+        if (IS_ERR(desc->page))
+                return PTR_ERR(desc->page);
+        res = nfs_readdir_search_array(desc);
+        /* Keep the page reference only when the search succeeded */
+        if (res != 0)
+                cache_page_release(desc);
+        return res;
+}
+/* Search for desc->dir_cookie from the beginning of the page cache */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 {
-        int             loop_count = 0;
+        int res = -EAGAIN;
-        int             res;
-        /* Always search-by-index from the beginning of the cache */
-        if (*desc->dir_cookie == 0) {
-                dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n",
-                                (long long)desc->file->f_pos);
-                desc->page_index = 0;
-                desc->entry->cookie = desc->entry->prev_cookie = 0;
-                desc->entry->eof = 0;
-                desc->current_index = 0;
-        } else
-                dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n",
-                                (unsigned long long)*desc->dir_cookie);
-        for (;;) {
+        while (1) {
-                res = find_dirent_page(desc);
+                res = find_cache_page(desc);
                if (res != -EAGAIN)
                        break;
-                /* Align to beginning of next page */
+                desc->page_index++;
-                desc->page_index ++;
-                if (loop_count++ > 200) {
-                        loop_count = 0;
-                        schedule();
-                }
        }
-        dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
        return res;
 }
@@ -400,8 +655,6 @@ static inline unsigned int dt_type(struct inode *inode)
        return (inode->i_mode >> 12) & 15;
 }
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc);
 /*
 * Once we've found the start of the dirent within a page: fill 'er up...
 */
@@ -410,49 +663,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                   filldir_t filldir)
 {
        struct file     *file = desc->file;
-        struct nfs_entry *entry = desc->entry;
+        int i = 0;
-        struct dentry   *dentry = NULL;
+        int res = 0;
-        u64             fileid;
+        struct nfs_cache_array *array = NULL;
-        int             loop_count = 0,
+        unsigned int d_type = DT_UNKNOWN;
-                        res;
+        struct dentry *dentry = NULL;
-        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n",
-                        (unsigned long long)entry->cookie);
-        for(;;) {
-                unsigned d_type = DT_UNKNOWN;
-                /* Note: entry->prev_cookie contains the cookie for
-                 *       retrieving the current dirent on the server */
-                fileid = entry->ino;
-                /* Get a dentry if we have one */
-                if (dentry != NULL)
-                        dput(dentry);
-                dentry = nfs_readdir_lookup(desc);
-                /* Use readdirplus info */
+        array = nfs_readdir_get_array(desc->page);
-                if (dentry != NULL && dentry->d_inode != NULL) {
-                        d_type = dt_type(dentry->d_inode);
-                        fileid = NFS_FILEID(dentry->d_inode);
-                }
-                res = filldir(dirent, entry->name, entry->len, 
+        for (i = desc->cache_entry_index; i < array->size; i++) {
-                              file->f_pos, nfs_compat_user_ino64(fileid),
+                d_type = DT_UNKNOWN;
-                              d_type);
+                res = filldir(dirent, array->array[i].string.name,
+                        array->array[i].string.len, file->f_pos,
+                        nfs_compat_user_ino64(array->array[i].ino), d_type);
                if (res < 0)
                        break;
                file->f_pos++;
-                *desc->dir_cookie = entry->cookie;
+                desc->cache_entry_index = i;
-                if (dir_decode(desc) != 0) {
+                if (i < (array->size-1))
-                        desc->page_index ++;
+                        *desc->dir_cookie = array->array[i+1].cookie;
+                else
+                        *desc->dir_cookie = array->last_cookie;
+                if (i == array->eof_index) {
+                        desc->eof = 1;
                        break;
                }
-                if (loop_count++ > 200) {
-                        loop_count = 0;
-                        schedule();
-                }
        }
-        dir_page_release(desc);
+        nfs_readdir_release_array(desc->page);
+        cache_page_release(desc);
        if (dentry != NULL)
                dput(dentry);
        dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
@@ -476,12 +716,9 @@ static inline
 int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                     filldir_t filldir)
 {
-        struct file     *file = desc->file;
-        struct inode    *inode = file->f_path.dentry->d_inode;
-        struct rpc_cred *cred = nfs_file_cred(file);
        struct page     *page = NULL;
        int             status;
-        unsigned long   timestamp, gencount;
+        struct inode *inode = desc->file->f_path.dentry->d_inode;
        dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                        (unsigned long long)*desc->dir_cookie);
@@ -491,38 +728,22 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
                status = -ENOMEM;
                goto out;
        }
-        timestamp = jiffies;
-        gencount = nfs_inc_attr_generation_counter();
+        if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
-        status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
-                                                *desc->dir_cookie, page,
-                                                NFS_SERVER(inode)->dtsize,
-                                                desc->plus);
-        desc->page = page;
-        desc->ptr = kmap(page);         /* matching kunmap in nfs_do_filldir */
-        if (status >= 0) {
-                desc->timestamp = timestamp;
-                desc->gencount = gencount;
-                desc->timestamp_valid = 1;
-                if ((status = dir_decode(desc)) == 0)
-                        desc->entry->prev_cookie = *desc->dir_cookie;
-        } else
                status = -EIO;
-        if (status < 0)
                goto out_release;
+        }
+        desc->page_index = 0;
+        desc->page = page;
        status = nfs_do_filldir(desc, dirent, filldir);
-        /* Reset read descriptor so it searches the page cache from
-         * the start upon the next call to readdir_search_pagecache() */
-        desc->page_index = 0;
-        desc->entry->cookie = desc->entry->prev_cookie = 0;
-        desc->entry->eof = 0;
 out:
        dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
                        __func__, status);
        return status;
 out_release:
-        dir_page_release(desc);
+        cache_page_release(desc);
        goto out;
 }
@@ -536,7 +757,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode    *inode = dentry->d_inode;
        nfs_readdir_descriptor_t my_desc,
                        *desc = &my_desc;
-        struct nfs_entry my_entry;
        int res = -ENOMEM;
        dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -557,26 +777,17 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        desc->decode = NFS_PROTO(inode)->decode_dirent;
        desc->plus = NFS_USE_READDIRPLUS(inode);
-        my_entry.cookie = my_entry.prev_cookie = 0;
-        my_entry.eof = 0;
-        my_entry.fh = nfs_alloc_fhandle();
-        my_entry.fattr = nfs_alloc_fattr();
-        if (my_entry.fh == NULL || my_entry.fattr == NULL)
-                goto out_alloc_failed;
-        desc->entry = &my_entry;
        nfs_block_sillyrename(dentry);
        res = nfs_revalidate_mapping(inode, filp->f_mapping);
        if (res < 0)
                goto out;
-        while(!desc->entry->eof) {
+        while (desc->eof != 1) {
                res = readdir_search_pagecache(desc);
                if (res == -EBADCOOKIE) {
                        /* This means either end of directory */
-                        if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
+                        if (*desc->dir_cookie && desc->eof == 0) {
                                /* Or that the server has 'lost' a cookie */
                                res = uncached_readdir(desc, dirent, filldir);
                                if (res >= 0)
@@ -588,8 +799,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                if (res == -ETOOSMALL && desc->plus) {
                        clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        nfs_zap_caches(inode);
+                        desc->page_index = 0;
                        desc->plus = 0;
-                        desc->entry->eof = 0;
+                        desc->eof = 0;
                        continue;
                }
                if (res < 0)
@@ -605,9 +817,6 @@ out:
        nfs_unblock_sillyrename(dentry);
        if (res > 0)
                res = 0;
-out_alloc_failed:
-        nfs_free_fattr(my_entry.fattr);
-        nfs_free_fhandle(my_entry.fh);
        dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        res);
@@ -1029,10 +1238,63 @@ static int is_atomic_open(struct nameidata *nd)
        return 1;
 }
+static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd)
+{
+        struct path path = {
+                .mnt = nd->path.mnt,
+                .dentry = dentry,
+        };
+        struct nfs_open_context *ctx;
+        struct rpc_cred *cred;
+        fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+        cred = rpc_lookup_cred();
+        if (IS_ERR(cred))
+                return ERR_CAST(cred);
+        ctx = alloc_nfs_open_context(&path, cred, fmode);
+        put_rpccred(cred);
+        if (ctx == NULL)
+                return ERR_PTR(-ENOMEM);
+        return ctx;
+}
+static int do_open(struct inode *inode, struct file *filp)
+{
+        nfs_fscache_set_inode_cookie(inode, filp);
+        return 0;
+}
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+        struct file *filp;
+        int ret = 0;
+        /* If the open_intent is for execute, we have an extra check to make */
+        if (ctx->mode & FMODE_EXEC) {
+                ret = nfs_may_open(ctx->path.dentry->d_inode,
+                                ctx->cred,
+                                nd->intent.open.flags);
+                if (ret < 0)
+                        goto out;
+        }
+        filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open);
+        if (IS_ERR(filp))
+                ret = PTR_ERR(filp);
+        else
+                nfs_file_set_open_context(filp, ctx);
+out:
+        put_nfs_open_context(ctx);
+        return ret;
+}
 static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
+        struct nfs_open_context *ctx;
+        struct iattr attr;
        struct dentry *res = NULL;
-        int error;
+        struct inode *inode;
+        int open_flags;
+        int err;
        dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1054,13 +1316,32 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                goto out;
        }
+        ctx = nameidata_to_nfs_open_context(dentry, nd);
+        res = ERR_CAST(ctx);
+        if (IS_ERR(ctx))
+                goto out;
+        open_flags = nd->intent.open.flags;
+        if (nd->flags & LOOKUP_CREATE) {
+                attr.ia_mode = nd->intent.open.create_mode;
+                attr.ia_valid = ATTR_MODE;
+                if (!IS_POSIXACL(dir))
+                        attr.ia_mode &= ~current_umask();
+        } else {
+                open_flags &= ~(O_EXCL | O_CREAT);
+                attr.ia_valid = 0;
+        }
        /* Open the file on the server */
-        res = nfs4_atomic_open(dir, dentry, nd);
+        nfs_block_sillyrename(dentry->d_parent);
-        if (IS_ERR(res)) {
+        inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
-                error = PTR_ERR(res);
+        if (IS_ERR(inode)) {
-                switch (error) {
+                nfs_unblock_sillyrename(dentry->d_parent);
+                put_nfs_open_context(ctx);
+                switch (PTR_ERR(inode)) {
                        /* Make a negative dentry */
                        case -ENOENT:
+                                d_add(dentry, NULL);
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
@@ -1072,11 +1353,25 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                        goto no_open;
                        /* case -EINVAL: */
                        default:
+                                res = ERR_CAST(inode);
                                goto out;
                }
-        } else if (res != NULL)
+        }
+        res = d_add_unique(dentry, inode);
+        nfs_unblock_sillyrename(dentry->d_parent);
+        if (res != NULL) {
+                dput(ctx->path.dentry);
+                ctx->path.dentry = dget(res);
                dentry = res;
+        }
+        err = nfs_intent_set_file(nd, ctx);
+        if (err < 0) {
+                if (res != NULL)
+                        dput(res);
+                return ERR_PTR(err);
+        }
 out:
+        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        return res;
 no_open:
        return nfs_lookup(dir, dentry, nd);
@@ -1087,12 +1382,15 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        struct dentry *parent = NULL;
        struct inode *inode = dentry->d_inode;
        struct inode *dir;
+        struct nfs_open_context *ctx;
        int openflags, ret = 0;
        if (!is_atomic_open(nd) || d_mountpoint(dentry))
                goto no_open;
        parent = dget_parent(dentry);
        dir = parent->d_inode;
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
@@ -1112,99 +1410,96 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        /* We can't create new files, or truncate existing ones here */
        openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
+        ctx = nameidata_to_nfs_open_context(dentry, nd);
+        ret = PTR_ERR(ctx);
+        if (IS_ERR(ctx))
+                goto out;
        /*
         * Note: we're not holding inode->i_mutex and so may be racing with
         * operations that change the directory. We therefore save the
         * change attribute *before* we do the RPC call.
         */
-        ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
+        inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+        if (IS_ERR(inode)) {
+                ret = PTR_ERR(inode);
+                switch (ret) {
+                case -EPERM:
+                case -EACCES:
+                case -EDQUOT:
+                case -ENOSPC:
+                case -EROFS:
+                        goto out_put_ctx;
+                default:
+                        goto out_drop;
+                }
+        }
+        iput(inode);
+        if (inode != dentry->d_inode)
+                goto out_drop;
+        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+        ret = nfs_intent_set_file(nd, ctx);
+        if (ret >= 0)
+                ret = 1;
 out:
        dput(parent);
-        if (!ret)
-                d_drop(dentry);
        return ret;
+out_drop:
+        d_drop(dentry);
+        ret = 0;
+out_put_ctx:
+        put_nfs_open_context(ctx);
+        goto out;
 no_open_dput:
        dput(parent);
 no_open:
        return nfs_lookup_revalidate(dentry, nd);
 }
-#endif /* CONFIG_NFSV4 */
-static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
+                struct nameidata *nd)
 {
-        struct dentry *parent = desc->file->f_path.dentry;
+        struct nfs_open_context *ctx = NULL;
-        struct inode *dir = parent->d_inode;
+        struct iattr attr;
-        struct nfs_entry *entry = desc->entry;
+        int error;
-        struct dentry *dentry, *alias;
+        int open_flags = 0;
-        struct qstr name = {
-                .name = entry->name,
-                .len = entry->len,
-        };
-        struct inode *inode;
-        unsigned long verf = nfs_save_change_attribute(dir);
-        switch (name.len) {
+        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
-                case 2:
+                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
-                        if (name.name[0] == '.' && name.name[1] == '.')
-                                return dget_parent(parent);
-                        break;
-                case 1:
-                        if (name.name[0] == '.')
-                                return dget(parent);
-        }
-        spin_lock(&dir->i_lock);
+        attr.ia_mode = mode;
-        if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) {
+        attr.ia_valid = ATTR_MODE;
-                spin_unlock(&dir->i_lock);
-                return NULL;
-        }
-        spin_unlock(&dir->i_lock);
-        name.hash = full_name_hash(name.name, name.len);
+        if ((nd->flags & LOOKUP_CREATE) != 0) {
-        dentry = d_lookup(parent, &name);
+                open_flags = nd->intent.open.flags;
-        if (dentry != NULL) {
-                /* Is this a positive dentry that matches the readdir info? */
-                if (dentry->d_inode != NULL &&
-                                (NFS_FILEID(dentry->d_inode) == entry->ino ||
-                                d_mountpoint(dentry))) {
-                        if (!desc->plus || entry->fh->size == 0)
-                                return dentry;
-                        if (nfs_compare_fh(NFS_FH(dentry->d_inode),
-                                                entry->fh) == 0)
-                                goto out_renew;
-                }
-                /* No, so d_drop to allow one to be created */
-                d_drop(dentry);
-                dput(dentry);
-        }
-        if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
-                return NULL;
-        if (name.len > NFS_SERVER(dir)->namelen)
-                return NULL;
-        /* Note: caller is already holding the dir->i_mutex! */
-        dentry = d_alloc(parent, &name);
-        if (dentry == NULL)
-                return NULL;
-        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
-        inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
-        if (IS_ERR(inode)) {
-                dput(dentry);
-                return NULL;
-        }
-        alias = d_materialise_unique(dentry, inode);
+                ctx = nameidata_to_nfs_open_context(dentry, nd);
-        if (alias != NULL) {
+                error = PTR_ERR(ctx);
-                dput(dentry);
+                if (IS_ERR(ctx))
-                if (IS_ERR(alias))
+                        goto out_err_drop;
-                        return NULL;
-                dentry = alias;
        }
-out_renew:
+        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
-        nfs_set_verifier(dentry, verf);
+        if (error != 0)
-        return dentry;
+                goto out_put_ctx;
+        if (ctx != NULL) {
+                error = nfs_intent_set_file(nd, ctx);
+                if (error < 0)
+                        goto out_err;
+        }
+        return 0;
+out_put_ctx:
+        if (ctx != NULL)
+                put_nfs_open_context(ctx);
+out_err_drop:
+        d_drop(dentry);
+out_err:
+        return error;
 }
+#endif /* CONFIG_NFSV4 */
 /*
 * Code common to create, mkdir, and mknod.
 */
@@ -1258,7 +1553,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
 {
        struct iattr attr;
        int error;
-        int open_flags = 0;
        dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
                        dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1266,10 +1560,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
        attr.ia_mode = mode;
        attr.ia_valid = ATTR_MODE;
-        if ((nd->flags & LOOKUP_CREATE) != 0)
+        error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
-                open_flags = nd->intent.open.flags;
-        error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
        if (error != 0)
                goto out_err;
        return 0;
@@ -1351,76 +1642,6 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
        return error;
 }
-static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
-{
-        static unsigned int sillycounter;
-        const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
-        const int      countersize = sizeof(sillycounter)*2;
-        const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
-        char           silly[slen+1];
-        struct qstr    qsilly;
-        struct dentry *sdentry;
-        int            error = -EIO;
-        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
-                dentry->d_parent->d_name.name, dentry->d_name.name, 
-                atomic_read(&dentry->d_count));
-        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
-        /*
-         * We don't allow a dentry to be silly-renamed twice.
-         */
-        error = -EBUSY;
-        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
-                goto out;
-        sprintf(silly, ".nfs%*.*Lx",
-                fileidsize, fileidsize,
-                (unsigned long long)NFS_FILEID(dentry->d_inode));
-        /* Return delegation in anticipation of the rename */
-        nfs_inode_return_delegation(dentry->d_inode);
-        sdentry = NULL;
-        do {
-                char *suffix = silly + slen - countersize;
-                dput(sdentry);
-                sillycounter++;
-                sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
-                dfprintk(VFS, "NFS: trying to rename %s to %s\n",
-                                dentry->d_name.name, silly);
-                
-                sdentry = lookup_one_len(silly, dentry->d_parent, slen);
-                /*
-                 * N.B. Better to return EBUSY here ... it could be
-                 * dangerous to delete the file while it's in use.
-                 */
-                if (IS_ERR(sdentry))
-                        goto out;
-        } while(sdentry->d_inode != NULL); /* need negative lookup */
-        qsilly.name = silly;
-        qsilly.len  = strlen(silly);
-        if (dentry->d_inode) {
-                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-                                dir, &qsilly);
-                nfs_mark_for_revalidate(dentry->d_inode);
-        } else
-                error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
-                                dir, &qsilly);
-        if (!error) {
-                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                d_move(dentry, sdentry);
-                error = nfs_async_unlink(dir, dentry);
-                /* If we return 0 we don't unlink */
-        }
-        dput(sdentry);
-out:
-        return error;
-}
 /*
 * Remove a file after making sure there are no pending writes,
 * and after checking that the file has only one user. 
@@ -1580,7 +1801,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
        d_drop(dentry);
        error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
        if (error == 0) {
-                atomic_inc(&inode->i_count);
+                ihold(inode);
                d_add(dentry, inode);
        }
        return error;
@@ -1711,14 +1932,14 @@ static void nfs_access_free_list(struct list_head *head)
 int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(head);
-        struct nfs_inode *nfsi;
+        struct nfs_inode *nfsi, *next;
        struct nfs_access_entry *cache;
        if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return (nr_to_scan == 0) ? 0 : -1;
        spin_lock(&nfs_access_lru_lock);
-        list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+        list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
                struct inode *inode;
                if (nr_to_scan-- == 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 064a80961677..84d3c8b90206 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
-        if (dreq->l_ctx != NULL)
+        if (dreq->l_ctx == NULL)
                goto out_release;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index dba50a5625db..a6e711ad130f 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -167,7 +167,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
                return 0;
        }
        item = container_of(h, struct nfs_dns_ent, h);
-        ttl = (long)item->h.expiry_time - (long)get_seconds();
+        ttl = item->h.expiry_time - seconds_since_boot();
        if (ttl < 0)
                ttl = 0;
@@ -239,7 +239,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
        ttl = get_expiry(&buf);
        if (ttl == 0)
                goto out;
-        key.h.expiry_time = ttl + get_seconds();
+        key.h.expiry_time = ttl + seconds_since_boot();
        ret = -ENOMEM;
        item = nfs_dns_lookup(cd, &key);
@@ -301,7 +301,7 @@ static int do_cache_lookup_nowait(struct cache_detail *cd,
                goto out_err;
        ret = -ETIMEDOUT;
        if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-                        || (*item)->h.expiry_time < get_seconds()
+                        || (*item)->h.expiry_time < seconds_since_boot()
                        || cd->flush_time > (*item)->h.last_refresh)
                goto out_put;
        ret = -ENOENT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05bf3c0dc751..60677f9f1311 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_FILE
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file->f_path.dentry->d_name.name,
                mapping->host->i_ino, len, (long long) pos);
+        pnfs_update_layout(mapping->host,
+                           nfs_file_open_context(file),
+                           IOMODE_RW);
 start:
        /*
         * Prevent starvation issues if someone is doing a consistency
@@ -551,7 +556,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct file *filp = vma->vm_file;
        struct dentry *dentry = filp->f_path.dentry;
        unsigned pagelen;
-        int ret = -EINVAL;
+        int ret = VM_FAULT_NOPAGE;
        struct address_space *mapping;
        dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
@@ -567,21 +572,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (mapping != dentry->d_inode->i_mapping)
                goto out_unlock;
-        ret = 0;
        pagelen = nfs_page_length(page);
        if (pagelen == 0)
                goto out_unlock;
-        ret = nfs_flush_incompatible(filp, page);
+        ret = VM_FAULT_LOCKED;
-        if (ret != 0)
+        if (nfs_flush_incompatible(filp, page) == 0 &&
-                goto out_unlock;
+            nfs_updatepage(filp, page, 0, pagelen) == 0)
+                goto out;
-        ret = nfs_updatepage(filp, page, 0, pagelen);
+        ret = VM_FAULT_SIGBUS;
 out_unlock:
-        if (!ret)
-                return VM_FAULT_LOCKED;
        unlock_page(page);
-        return VM_FAULT_SIGBUS;
+out:
+        return ret;
 }
 static const struct vm_operations_struct nfs_file_vm_ops = {
@@ -684,7 +688,8 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
        return ret;
 }
-static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status = 0;
@@ -699,7 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
        if (nfs_have_delegation(inode, FMODE_READ))
                goto out_noconflict;
-        if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+        if (is_local)
                goto out_noconflict;
        status = NFS_PROTO(inode)->lock(filp, cmd, fl);
@@ -726,7 +731,8 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
        return res;
 }
-static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status;
@@ -741,15 +747,24 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
         *      If we're signalled while cleaning up locks on process exit, we
         *      still need to complete the unlock.
         */
-        /* Use local locking if mounted with "-onolock" */
+        /*
-        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+         * Use local locking if mounted with "-onolock" or with appropriate
+         * "-olocal_lock="
+         */
+        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
        return status;
 }
-static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
+static int
+is_time_granular(struct timespec *ts) {
+        return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
+}
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
        struct inode *inode = filp->f_mapping->host;
        int status;
@@ -762,20 +777,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
        if (status != 0)
                goto out;
-        /* Use local locking if mounted with "-onolock" */
+        /*
-        if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
+         * Use local locking if mounted with "-onolock" or with appropriate
+         * "-olocal_lock="
+         */
+        if (!is_local)
                status = NFS_PROTO(inode)->lock(filp, cmd, fl);
        else
                status = do_vfs_lock(filp, fl);
        if (status < 0)
                goto out;
        /*
-         * Make sure we clear the cache whenever we try to get the lock.
+         * Revalidate the cache if the server has time stamps granular
+         * enough to detect subsecond changes.  Otherwise, clear the
+         * cache to prevent missing any changes.
+         *
         * This makes locking act as a cache coherency point.
         */
        nfs_sync_mapping(filp->f_mapping);
-        if (!nfs_have_delegation(inode, FMODE_READ))
+        if (!nfs_have_delegation(inode, FMODE_READ)) {
-                nfs_zap_caches(inode);
+                if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+                        __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+                else
+                        nfs_zap_caches(inode);
+        }
 out:
        return status;
 }
@@ -787,6 +813,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
        struct inode *inode = filp->f_mapping->host;
        int ret = -ENOLCK;
+        int is_local = 0;
        dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
                        filp->f_path.dentry->d_parent->d_name.name,
@@ -800,6 +827,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
        if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
                goto out_err;
+        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+                is_local = 1;
        if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
                ret = NFS_PROTO(inode)->lock_check_bounds(fl);
                if (ret < 0)
@@ -807,11 +837,11 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
        }
        if (IS_GETLK(cmd))
-                ret = do_getlk(filp, cmd, fl);
+                ret = do_getlk(filp, cmd, fl, is_local);
        else if (fl->fl_type == F_UNLCK)
-                ret = do_unlk(filp, cmd, fl);
+                ret = do_unlk(filp, cmd, fl, is_local);
        else
-                ret = do_setlk(filp, cmd, fl);
+                ret = do_setlk(filp, cmd, fl, is_local);
 out_err:
        return ret;
 }
@@ -821,6 +851,9 @@ out_err:
 */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
+        struct inode *inode = filp->f_mapping->host;
+        int is_local = 0;
        dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
                        filp->f_path.dentry->d_parent->d_name.name,
                        filp->f_path.dentry->d_name.name,
@@ -829,14 +862,17 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
+        if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+                is_local = 1;
        /* We're simulating flock() locks using posix locks on the server */
        fl->fl_owner = (fl_owner_t)filp;
        fl->fl_start = 0;
        fl->fl_end = OFFSET_MAX;
        if (fl->fl_type == F_UNLCK)
-                return do_unlk(filp, cmd, fl);
+                return do_unlk(filp, cmd, fl, is_local);
-        return do_setlk(filp, cmd, fl);
+        return do_setlk(filp, cmd, fl, is_local);
 }
 /*
@@ -848,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
        dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
                        file->f_path.dentry->d_parent->d_name.name,
                        file->f_path.dentry->d_name.name, arg);
        return -EINVAL;
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446e1605..ac7b814ce162 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
                        iput(inode);
                        return -ENOMEM;
                }
-                /* Circumvent igrab(): we know the inode is not being freed */
+                ihold(inode);
-                atomic_inc(&inode->i_count);
                /*
                 * Ensure that this dentry is invisible to d_find_alias().
                 * Otherwise, it may be spliced into the tree by
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 21a84d45916f..4e2d9b6b1380 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,6 +34,212 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/nfs_idmap.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <keys/user-type.h>
+#define NFS_UINT_MAXLEN 11
+const struct cred *id_resolver_cache;
+/*
+ * Key type "id_resolver", used for the id <-> name mapping keys requested
+ * below.  Every operation is delegated to the generic user_* handlers.
+ */
+struct key_type key_type_id_resolver = {
+        .name           = "id_resolver",
+        .instantiate    = user_instantiate,
+        .match          = user_match,
+        .revoke         = user_revoke,
+        .destroy        = user_destroy,
+        .describe       = user_describe,
+        .read           = user_read,
+};
+/*
+ * Prepare the credentials used for id_resolver key requests.
+ *
+ * Allocates a kernel credential and a ".id_resolver" keyring, registers the
+ * id_resolver key type, attaches the keyring to the credential as its thread
+ * keyring and stores the credential in id_resolver_cache.
+ *
+ * Returns 0 on success or a negative errno; on failure the keyring and the
+ * credential references taken here are dropped again.
+ */
+int nfs_idmap_init(void)
+{
+        struct cred *kcred;
+        struct key *ring;
+        int err;
+        printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+        kcred = prepare_kernel_cred(NULL);
+        if (kcred == NULL)
+                return -ENOMEM;
+        ring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, kcred,
+                         (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                         KEY_USR_VIEW | KEY_USR_READ,
+                         KEY_ALLOC_NOT_IN_QUOTA);
+        if (IS_ERR(ring)) {
+                err = PTR_ERR(ring);
+                goto out_put_cred;
+        }
+        /* Both remaining steps share the same unwind path. */
+        err = key_instantiate_and_link(ring, NULL, 0, NULL, NULL);
+        if (err >= 0)
+                err = register_key_type(&key_type_id_resolver);
+        if (err < 0)
+                goto out_put_key;
+        /* Publish the credential only after every step has succeeded. */
+        kcred->thread_keyring = ring;
+        kcred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+        id_resolver_cache = kcred;
+        return 0;
+out_put_key:
+        key_put(ring);
+out_put_cred:
+        put_cred(kcred);
+        return err;
+}
+/*
+ * Undo nfs_idmap_init(): revoke the resolver keyring, unregister the
+ * id_resolver key type and drop the reference on the cached credential.
+ */
+void nfs_idmap_quit(void)
+{
+        const struct cred *resolver_cred = id_resolver_cache;
+        key_revoke(resolver_cred->thread_keyring);
+        unregister_key_type(&key_type_id_resolver);
+        put_cred(resolver_cred);
+}
+/*
+ * Assemble the "<type>:<name>" description to pass to request_key().
+ *
+ * @name/@namelen: name part; need not be NUL-terminated, exactly @namelen
+ *                 bytes are copied.
+ * @type/@typelen: type prefix; exactly @typelen bytes are copied.
+ * @desc:          set to a newly kmalloc()ed, NUL-terminated string.  The
+ *                 caller is responsible for kfree()ing it.
+ *
+ * Returns -ENOMEM if the allocation fails (*desc is then NULL).  Otherwise
+ * returns the size of the allocated buffer, i.e. typelen + namelen + 2,
+ * which counts the ':' separator and the trailing NUL.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+                                const char *type, size_t typelen, char **desc)
+{
+        char *cp;
+        size_t desclen = typelen + namelen + 2;
+        *desc = kmalloc(desclen, GFP_KERNEL);
+        if (!*desc)
+                return -ENOMEM;
+        cp = *desc;
+        memcpy(cp, type, typelen);
+        cp += typelen;
+        *cp++ = ':';
+        memcpy(cp, name, namelen);
+        cp += namelen;
+        *cp = '\0';
+        return desclen;
+}
+/*
+ * Request the id_resolver key described by "<type>:<name>" and copy its
+ * payload into @data.
+ *
+ * The request is made with id_resolver_cache as the overriding credentials;
+ * the caller's credentials are restored before returning.
+ *
+ * @name/@namelen: name part of the key description.
+ * @type:          NUL-terminated type prefix of the key description.
+ * @data:          destination buffer for the payload bytes (copied verbatim,
+ *                 no terminator is appended).
+ * @data_size:     capacity of @data in bytes.
+ *
+ * Returns the number of payload bytes copied (always > 0) on success, or a
+ * negative errno.  -EINVAL is returned when the key has no payload, or when
+ * the payload is empty or larger than @data_size.
+ */
+static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
+                const char *type, void *data, size_t data_size)
+{
+        const struct cred *saved_cred;
+        struct key *rkey;
+        char *desc;
+        struct user_key_payload *payload;
+        ssize_t ret;
+        ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+        if (ret <= 0)
+                goto out;
+        saved_cred = override_creds(id_resolver_cache);
+        rkey = request_key(&key_type_id_resolver, desc, "");
+        revert_creds(saved_cred);
+        kfree(desc);
+        if (IS_ERR(rkey)) {
+                ret = PTR_ERR(rkey);
+                goto out;
+        }
+        rcu_read_lock();
+        rkey->perm |= KEY_USR_VIEW;
+        ret = key_validate(rkey);
+        if (ret < 0)
+                goto out_up;
+        payload = rcu_dereference(rkey->payload.data);
+        if (IS_ERR_OR_NULL(payload)) {
+                /*
+                 * PTR_ERR(NULL) is 0, which callers would take for a
+                 * successful zero-length result; report it as an error.
+                 */
+                ret = payload ? PTR_ERR(payload) : -EINVAL;
+                goto out_up;
+        }
+        ret = payload->datalen;
+        if (ret > 0 && ret <= data_size)
+                memcpy(data, payload->data, ret);
+        else
+                ret = -EINVAL;
+out_up:
+        rcu_read_unlock();
+        key_put(rkey);
+out:
+        return ret;
+}
+/*
+ * ID -> Name
+ *
+ * Formats @id as a decimal string and asks for the "<type>:<id>" key; the
+ * key payload is copied verbatim into @buf (at most @buflen bytes, no
+ * terminator is added here).
+ *
+ * Returns the payload length on success.  Every failure of the key request
+ * is reported as -EINVAL.
+ */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+{
+        char digits[NFS_UINT_MAXLEN];
+        int ndigits = snprintf(digits, sizeof(digits), "%u", id);
+        ssize_t len = nfs_idmap_request_key(digits, ndigits, type, buf, buflen);
+        return len < 0 ? -EINVAL : len;
+}
+/*
+ * Name -> ID
+ *
+ * Requests the "<type>:<name>" key and parses its payload as a base-10
+ * number.
+ *
+ * @name/@namelen: name to resolve.
+ * @type:          NUL-terminated key type prefix.
+ * @id:            receives the parsed id; written only on success.
+ *
+ * Returns 0 on success, -EINVAL if the key request fails or yields no data,
+ * or the error returned by strict_strtol() if the payload is not a number.
+ */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen,
+                                const char *type, __u32 *id)
+{
+        /* One spare byte: the payload is copied raw and may lack a NUL. */
+        char id_str[NFS_UINT_MAXLEN + 1];
+        long id_long;
+        ssize_t data_size;
+        int ret;
+        data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+        if (data_size <= 0)
+                return -EINVAL;
+        /* data_size <= NFS_UINT_MAXLEN here, so this index is in bounds. */
+        id_str[data_size] = '\0';
+        ret = strict_strtol(id_str, 10, &id_long);
+        if (ret == 0)
+                *id = (__u32)id_long;
+        return ret;
+}
+/*
+ * Resolve a user name to a uid through a "uid:<name>" id_resolver key.
+ * @clp is not used by this implementation.  Returns 0 or a negative errno.
+ */
+int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+{
+        return nfs_idmap_lookup_id(name, namelen, "uid", uid);
+}
+/*
+ * Resolve a group name to a gid through a "gid:<name>" id_resolver key.
+ * @clp is not used by this implementation.  Returns 0 or a negative errno.
+ */
+int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+{
+        return nfs_idmap_lookup_id(name, namelen, "gid", gid);
+}
+/*
+ * Resolve a uid to a user name through a "user:<uid>" id_resolver key.
+ * The name is written to @buf (capacity @buflen); @clp is not used by this
+ * implementation.  Returns the name length or a negative errno.
+ */
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+{
+        return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+}
+/*
+ * Resolve a gid to a group name through a "group:<gid>" id_resolver key.
+ * The name is written to @buf (capacity @buflen); @clp is not used by this
+ * implementation.  Returns the name length or a negative errno.
+ */
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+{
+        return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+}
+#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
@@ -503,16 +709,17 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
        return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
        struct idmap *idmap = clp->cl_idmap;
        return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
        struct idmap *idmap = clp->cl_idmap;
        return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
 }
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7d2d6c72aa78..314f57164602 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,6 +48,7 @@
 #include "internal.h"
 #include "fscache.h"
 #include "dns_resolve.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_VFS
@@ -234,9 +235,6 @@ nfs_init_locked(struct inode *inode, void *opaque)
        return 0;
 }
-/* Don't use READDIRPLUS on directories that we believe are too large */
-#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE)
 /*
 * This is our front-end to iget that looks up inodes by file handle
 * instead of inode number.
@@ -291,8 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
-                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
+                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
-                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
                        if ((fattr->valid & NFS_ATTR_FATTR_FSID)
@@ -623,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
        nfs_revalidate_inode(server, inode);
 }
-static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
+struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode)
 {
        struct nfs_open_context *ctx;
@@ -633,11 +630,13 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
                path_get(&ctx->path);
                ctx->cred = get_rpccred(cred);
                ctx->state = NULL;
+                ctx->mode = f_mode;
                ctx->flags = 0;
                ctx->error = 0;
                ctx->dir_cookie = 0;
                nfs_init_lock_context(&ctx->lock_context);
                ctx->lock_context.open_context = ctx;
+                INIT_LIST_HEAD(&ctx->list);
        }
        return ctx;
 }
@@ -653,11 +652,15 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
        struct inode *inode = ctx->path.dentry->d_inode;
-        if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+        if (!list_empty(&ctx->list)) {
+                if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+                        return;
+                list_del(&ctx->list);
+                spin_unlock(&inode->i_lock);
+        } else if (!atomic_dec_and_test(&ctx->lock_context.count))
                return;
-        list_del(&ctx->list);
+        if (inode != NULL)
-        spin_unlock(&inode->i_lock);
+                NFS_PROTO(inode)->close_context(ctx, is_sync);
-        NFS_PROTO(inode)->close_context(ctx, is_sync);
        if (ctx->cred != NULL)
                put_rpccred(ctx->cred);
        path_put(&ctx->path);
@@ -673,7 +676,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 * Ensure that mmap has a recent RPC credential for use when writing out
 * shared pages
 */
-static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -730,11 +733,10 @@ int nfs_open(struct inode *inode, struct file *filp)
        cred = rpc_lookup_cred();
        if (IS_ERR(cred))
                return PTR_ERR(cred);
-        ctx = alloc_nfs_open_context(&filp->f_path, cred);
+        ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode);
        put_rpccred(cred);
        if (ctx == NULL)
                return -ENOMEM;
-        ctx->mode = filp->f_mode;
        nfs_file_set_open_context(filp, ctx);
        put_nfs_open_context(ctx);
        nfs_fscache_set_inode_cookie(inode, filp);
@@ -1409,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
 {
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
+        pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
@@ -1446,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
        nfsi->delegation = NULL;
        nfsi->delegation_state = 0;
        init_rwsem(&nfsi->rwsem);
+        nfsi->layout = NULL;
 #endif
 }
@@ -1493,7 +1497,7 @@ static int nfsiod_start(void)
 {
        struct workqueue_struct *wq;
        dprintk("RPC:       creating workqueue nfsiod\n");
-        wq = create_singlethread_workqueue("nfsiod");
+        wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
        if (wq == NULL)
                return -ENOMEM;
        nfsiod_workqueue = wq;
@@ -1521,6 +1525,10 @@ static int __init init_nfs_fs(void)
 {
        int err;
+        err = nfs_idmap_init();
+        if (err < 0)
+                goto out9;
        err = nfs_dns_resolver_init();
        if (err < 0)
                goto out8;
@@ -1585,6 +1593,8 @@ out6:
 out7:
        nfs_dns_resolver_destroy();
 out8:
+        nfs_idmap_quit();
+out9:
        return err;
 }
@@ -1597,6 +1607,7 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_nfspagecache();
        nfs_fscache_unregister();
        nfs_dns_resolver_destroy();
+        nfs_idmap_quit();
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c961bc92c107..db08ff3ff454 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,12 @@ struct nfs_clone_mount {
 #define NFS_UNSPEC_PORT         (-1)
 /*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+/*
 * In-kernel mount arguments
 */
 struct nfs_parsed_mount_data {
@@ -181,15 +187,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 #endif
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 59047f8d7d72..eceafe74f473 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -153,6 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
                .rpc_resp       = &result,
        };
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = info->protocol,
                .address        = info->sap,
                .addrsize       = info->salen,
@@ -224,6 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
                .to_retries = 2,
        };
        struct rpc_create_args args = {
+                .net            = &init_net,
                .protocol       = IPPROTO_UDP,
                .address        = info->sap,
                .addrsize       = info->salen,
@@ -436,7 +438,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
        for (i = 0; i < entries; i++) {
                flavors[i] = ntohl(*p++);
-                dprintk("NFS:\tflavor %u: %d\n", i, flavors[i]);
+                dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);
        }
        *count = i;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index db8846a0e82e..e6bf45710cc7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -337,10 +337,10 @@ nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
 static int
 nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        p = xdr_encode_fhandle(p, args->old_dir);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-        p = xdr_encode_fhandle(p, args->tofh);
+        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
        return 0;
 }
@@ -423,9 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct page **page;
        size_t hdrlen;
        unsigned int pglen, recvd;
-        u32 len;
        int status, nr = 0;
-        __be32 *end, *entry, *kaddr;
        if ((status = ntohl(*p++)))
                return nfs_stat_to_errno(status);
@@ -445,80 +443,59 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        if (pglen > recvd)
                pglen = recvd;
        page = rcvbuf->pages;
-        kaddr = p = kmap_atomic(*page, KM_USER0);
-        end = (__be32 *)((char *)p + pglen);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                if (p + 2 > end)
-                        goto short_pkt;
-                p++; /* fileid */
-                len = ntohl(*p++);
-                p += XDR_QUADLEN(len) + 1;      /* name plus cookie */
-                if (len > NFS2_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-                                                len);
-                        goto err_unmap;
-                }
-                if (p + 2 > end)
-                        goto short_pkt;
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
- out:
-        kunmap_atomic(kaddr, KM_USER0);
        return nr;
- short_pkt:
+}
-        /*
-         * When we get a short packet there are 2 possibilities. We can
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-         * return an error, or fix up the response to look like a valid
+{
-         * response and return what we have so far. If there are no
+        dprintk("nfs: %s: prematurely hit end of receive buffer. "
-         * entries and the packet was short, then return -EIO. If there
+                "Remaining buffer length is %tu words.\n",
-         * are valid entries in the response, return them and pretend that
+                func, xdr->end - xdr->p);
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        entry[0] = entry[1] = 0;
-        if (!nr)
-                nr = -errno_NFSERR_IO;
-        goto out;
-err_unmap:
-        nr = -errno_NFSERR_IO;
-        goto out;
 }
 __be32 *
-nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
-        if (!*p++) {
+        __be32 *p;
-                if (!*p)
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 8);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->ino        = ntohl(*p++);
        entry->len        = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len + 4);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name       = (const char *) p;
        p                += XDR_QUADLEN(entry->len);
        entry->prev_cookie        = entry->cookie;
        entry->cookie     = ntohl(*p++);
-        entry->eof        = !p[0] && p[1];
+        p = xdr_inline_peek(xdr, 8);
+        if (p != NULL)
+                entry->eof = !p[0] && p[1];
+        else
+                entry->eof = 0;
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
 }
 /*
@@ -596,7 +573,6 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct kvec *iov = rcvbuf->head;
        size_t hdrlen;
        u32 len, recvd;
-        char    *kaddr;
        int     status;
        if ((status = ntohl(*p++)))
@@ -623,10 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
                return -EIO;
        }
-        /* NULL terminate the string we got */
+        xdr_terminate_string(rcvbuf, len);
-        kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index fabb4f2849a1..ce939c062a52 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -313,7 +313,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
 */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
        struct nfs3_createdata *data;
        mode_t mode = sattr->ia_mode;
@@ -438,19 +438,38 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+/*
+ * Point @msg at the NFSv3 RENAME procedure.  @dir is not used here.
+ */
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+        msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
+}
+/*
+ * Completion handler for a RENAME task.
+ *
+ * Returns 0 without touching the inodes when nfs3_async_handle_jukebox()
+ * reports that it dealt with the task.  Otherwise updates both directories
+ * from the post-op attributes in the RPC result and returns 1.
+ */
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                      struct inode *new_dir)
+{
+        struct nfs_renameres *result;
+        if (!nfs3_async_handle_jukebox(task, old_dir)) {
+                result = task->tk_msg.rpc_resp;
+                nfs_post_op_update_inode(old_dir, result->old_fattr);
+                nfs_post_op_update_inode(new_dir, result->new_fattr);
+                return 1;
+        }
+        return 0;
+}
 static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
                 struct inode *new_dir, struct qstr *new_name)
 {
-        struct nfs3_renameargs  arg = {
+        struct nfs_renameargs   arg = {
-                .fromfh         = NFS_FH(old_dir),
+                .old_dir        = NFS_FH(old_dir),
-                .fromname       = old_name->name,
+                .old_name       = old_name,
-                .fromlen        = old_name->len,
+                .new_dir        = NFS_FH(new_dir),
-                .tofh           = NFS_FH(new_dir),
+                .new_name       = new_name,
-                .toname         = new_name->name,
-                .tolen          = new_name->len
        };
-        struct nfs3_renameres res;
+        struct nfs_renameres res;
        struct rpc_message msg = {
                .rpc_proc       = &nfs3_procedures[NFS3PROC_RENAME],
                .rpc_argp       = &arg,
@@ -460,17 +479,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
        dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
-        res.fromattr = nfs_alloc_fattr();
+        res.old_fattr = nfs_alloc_fattr();
-        res.toattr = nfs_alloc_fattr();
+        res.new_fattr = nfs_alloc_fattr();
-        if (res.fromattr == NULL || res.toattr == NULL)
+        if (res.old_fattr == NULL || res.new_fattr == NULL)
                goto out;
        status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-        nfs_post_op_update_inode(old_dir, res.fromattr);
+        nfs_post_op_update_inode(old_dir, res.old_fattr);
-        nfs_post_op_update_inode(new_dir, res.toattr);
+        nfs_post_op_update_inode(new_dir, res.new_fattr);
 out:
-        nfs_free_fattr(res.toattr);
+        nfs_free_fattr(res.old_fattr);
-        nfs_free_fattr(res.fromattr);
+        nfs_free_fattr(res.new_fattr);
        dprintk("NFS reply rename: %d\n", status);
        return status;
 }
@@ -611,7 +630,7 @@ out:
 */
 static int
 nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                  u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        __be32                  *verf = NFS_COOKIEVERF(dir);
@@ -621,7 +640,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
                .verf           = {verf[0], verf[1]},
                .plus           = plus,
                .count          = count,
-                .pages          = &page
+                .pages          = pages
        };
        struct nfs3_readdirres  res = {
                .verf           = verf,
@@ -652,7 +671,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
        nfs_free_fattr(res.dir_attr);
 out:
-        dprintk("NFS reply readdir: %d\n", status);
+        dprintk("NFS reply readdir%s: %d\n",
+                        plus? "plus" : "", status);
        return status;
 }
@@ -722,7 +742,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
        dprintk("NFS call  fsstat\n");
        nfs_fattr_init(stat->fattr);
        status = rpc_call_sync(server->client, &msg, 0);
-        dprintk("NFS reply statfs: %d\n", status);
+        dprintk("NFS reply fsstat: %d\n", status);
        return status;
 }
@@ -844,6 +864,8 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
        .unlink_setup   = nfs3_proc_unlink_setup,
        .unlink_done    = nfs3_proc_unlink_done,
        .rename         = nfs3_proc_rename,
+        .rename_setup   = nfs3_proc_rename_setup,
+        .rename_done    = nfs3_proc_rename_done,
        .link           = nfs3_proc_link,
        .symlink        = nfs3_proc_symlink,
        .mkdir          = nfs3_proc_mkdir,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9769704f8ce6..d9a5e832c257 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -100,6 +100,13 @@ static const umode_t nfs_type2fmt[] = {
        [NF3FIFO] = S_IFIFO,
 };
+/*
+ * Debug helper: report that decoder @func ran off the end of the receive
+ * buffer, together with the number of words left between xdr->p and xdr->end.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+        dprintk("nfs: %s: prematurely hit end of receive buffer. Remaining buffer length is %tu words.\n",
+                func, xdr->end - xdr->p);
+}
 /*
 * Common NFS XDR functions as inlines
 */
@@ -119,6 +126,29 @@ xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
        return NULL;
 }
+/*
+ * Decode a variable-length file handle from an xdr_stream into @fh.
+ *
+ * Returns a pointer just past the handle data (rounded up to whole XDR
+ * words) on success, NULL if the advertised size exceeds NFS3_FHSIZE
+ * (fh->size is left holding that value), or ERR_PTR(-EIO) if the stream
+ * does not contain enough data.
+ */
+static inline __be32 *
+xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        fh->size = ntohl(*p);
+        if (fh->size > NFS3_FHSIZE)
+                return NULL;
+        p = xdr_inline_decode(xdr, fh->size);
+        if (unlikely(p == NULL))
+                goto out_overflow;
+        memcpy(fh->data, p, fh->size);
+        return p + XDR_QUADLEN(fh->size);
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
+}
 /*
 * Encode/decode time.
 */
@@ -241,6 +271,26 @@ xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
 }
 static inline __be32 *
+xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+        __be32 *p;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        if (ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 84);
+                if (unlikely(!p))
+                        goto out_overflow;
+                p = xdr_decode_fattr(p, fattr);
+        }
+        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
+}
+static inline __be32 *
 xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
        if (*p++)
@@ -442,12 +492,12 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
 * Encode RENAME arguments
 */
 static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
-        p = xdr_encode_fhandle(p, args->fromfh);
+        p = xdr_encode_fhandle(p, args->old_dir);
-        p = xdr_encode_array(p, args->fromname, args->fromlen);
+        p = xdr_encode_array(p, args->old_name->name, args->old_name->len);
-        p = xdr_encode_fhandle(p, args->tofh);
+        p = xdr_encode_fhandle(p, args->new_dir);
-        p = xdr_encode_array(p, args->toname, args->tolen);
+        p = xdr_encode_array(p, args->new_name->name, args->new_name->len);
        req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
        return 0;
 }
@@ -504,9 +554,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        struct kvec *iov = rcvbuf->head;
        struct page **page;
        size_t hdrlen;
-        u32 len, recvd, pglen;
+        u32 recvd, pglen;
        int status, nr = 0;
-        __be32 *entry, *end, *kaddr;
        status = ntohl(*p++);
        /* Decode post_op_attrs */
@@ -536,99 +585,38 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        if (pglen > recvd)
                pglen = recvd;
        page = rcvbuf->pages;
-        kaddr = p = kmap_atomic(*page, KM_USER0);
-        end = (__be32 *)((char *)p + pglen);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                if (p + 3 > end)
-                        goto short_pkt;
-                p += 2;                         /* inode # */
-                len = ntohl(*p++);              /* string length */
-                p += XDR_QUADLEN(len) + 2;      /* name + cookie */
-                if (len > NFS3_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
-                                                len);
-                        goto err_unmap;
-                }
-                if (res->plus) {
-                        /* post_op_attr */
-                        if (p + 2 > end)
-                                goto short_pkt;
-                        if (*p++) {
-                                p += 21;
-                                if (p + 1 > end)
-                                        goto short_pkt;
-                        }
-                        /* post_op_fh3 */
-                        if (*p++) {
-                                if (p + 1 > end)
-                                        goto short_pkt;
-                                len = ntohl(*p++);
-                                if (len > NFS3_FHSIZE) {
-                                        dprintk("NFS: giant filehandle in "
-                                                "readdir (len 0x%x)!\n", len);
-                                        goto err_unmap;
-                                }
-                                p += XDR_QUADLEN(len);
-                        }
-                }
-                if (p + 2 > end)
-                        goto short_pkt;
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
- out:
-        kunmap_atomic(kaddr, KM_USER0);
        return nr;
- short_pkt:
-        /*
-         * When we get a short packet there are 2 possibilities. We can
-         * return an error, or fix up the response to look like a valid
-         * response and return what we have so far. If there are no
-         * entries and the packet was short, then return -EIO. If there
-         * are valid entries in the response, return them and pretend that
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        entry[0] = entry[1] = 0;
-        if (!nr)
-                nr = -errno_NFSERR_IO;
-        goto out;
-err_unmap:
-        nr = -errno_NFSERR_IO;
-        goto out;
 }
 __be32 *
-nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus)
 {
+        __be32 *p;
        struct nfs_entry old = *entry;
-        if (!*p++) {
+        p = xdr_inline_decode(xdr, 4);
-                if (!*p)
+        if (unlikely(!p))
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 12);
+        if (unlikely(!p))
+                goto out_overflow;
        p = xdr_decode_hyper(p, &entry->ino);
        entry->len  = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len + 8);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name = (const char *) p;
        p += XDR_QUADLEN(entry->len);
        entry->prev_cookie = entry->cookie;
@@ -636,10 +624,17 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
        if (plus) {
                entry->fattr->valid = 0;
-                p = xdr_decode_post_op_attr(p, entry->fattr);
+                p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
+                if (IS_ERR(p))
+                        goto out_overflow_exit;
                /* In fact, a post_op_fh3: */
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
                if (*p++) {
-                        p = xdr_decode_fhandle(p, entry->fh);
+                        p = xdr_decode_fhandle_stream(xdr, entry->fh);
+                        if (IS_ERR(p))
+                                goto out_overflow_exit;
                        /* Ugh -- server reply was truncated */
                        if (p == NULL) {
                                dprintk("NFS: FH truncated\n");
@@ -650,8 +645,18 @@ nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
                        memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
        }
-        entry->eof = !p[0] && p[1];
+        p = xdr_inline_peek(xdr, 8);
+        if (p != NULL)
+                entry->eof = !p[0] && p[1];
+        else
+                entry->eof = 0;
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+out_overflow_exit:
+        return ERR_PTR(-EIO);
 }
 /*
@@ -824,7 +829,6 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        struct kvec *iov = rcvbuf->head;
        size_t hdrlen;
        u32 len, recvd;
-        char    *kaddr;
        int     status;
        status = ntohl(*p++);
@@ -857,10 +861,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
                return -EIO;
        }
-        /* NULL terminate the string we got */
+        xdr_terminate_string(rcvbuf, len);
-        kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 }
@@ -970,14 +971,14 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 * Decode RENAME reply
 */
 static int
-nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res)
 {
        int     status;
        if ((status = ntohl(*p++)) != 0)
                status = nfs_stat_to_errno(status);
-        p = xdr_decode_wcc_data(p, res->fromattr);
+        p = xdr_decode_wcc_data(p, res->old_fattr);
-        p = xdr_decode_wcc_data(p, res->toattr);
+        p = xdr_decode_wcc_data(p, res->new_fattr);
        return status;
 }
@@ -1043,8 +1044,9 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
        res->wtmult = ntohl(*p++);
        res->dtpref = ntohl(*p++);
        p = xdr_decode_hyper(p, &res->maxfilesize);
+        p = xdr_decode_time3(p, &res->time_delta);
-        /* ignore time_delta and properties */
+        /* ignore properties */
        res->lease_time = 0;
        return 0;
 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 311e15cc8af0..9fa496387fdf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,8 +242,6 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
-extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
@@ -333,7 +331,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 /* nfs4xdr.c */
-extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
 extern struct rpc_procinfo nfs4_procedures[];
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644
index 000000000000..2e92f0d8d654
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.c
@@ -0,0 +1,280 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs4filelayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+/*
+ * Initialize the deviceid cache of the server's nfs_client, registering
+ * nfs4_fl_free_deviceid_callback as its free callback.
+ * Returns 0 on success, otherwise the status reported by
+ * pnfs_alloc_init_deviceid_cache().
+ */
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
+{
+        int err;
+        err = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+                                             nfs4_fl_free_deviceid_callback);
+        if (!err) {
+                dprintk("%s: deviceid cache has been initialized successfully\n",
+                        __func__);
+                return 0;
+        }
+        printk(KERN_WARNING "%s: deviceid cache could not be "
+                "initialized\n", __func__);
+        return err;
+}
+/*
+ * Clear out the layout driver state: if the client has a deviceid cache,
+ * hand it to pnfs_put_deviceid_cache(). Always returns 0.
+ */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
+{
+        struct nfs_client *clp = nfss->nfs_client;
+        dprintk("--> %s\n", __func__);
+        if (clp->cl_devid_cache != NULL)
+                pnfs_put_deviceid_cache(clp);
+        return 0;
+}
+/*
+ * filelayout_check_layout()
+ *
+ * Sanity-check the decoded layout segment parameters against the device
+ * named by @id. At this point no generic layer initialization of the lseg
+ * has occurred, and nothing has been added to the layout_hdr cache.
+ *
+ * Returns 0 when the segment is acceptable (fl->dsaddr then points at the
+ * device), -EINVAL otherwise. When a check fails after the device was
+ * obtained, the device is released again with pnfs_put_deviceid().
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+                        struct nfs4_filelayout_segment *fl,
+                        struct nfs4_layoutget_res *lgr,
+                        struct nfs4_deviceid *id)
+{
+        struct nfs_server *server = NFS_SERVER(lo->inode);
+        struct nfs4_file_layout_dsaddr *dev;
+        int bad_fh_count;
+        int ret = -EINVAL;
+        dprintk("--> %s\n", __func__);
+        if (fl->pattern_offset > lgr->range.offset) {
+                dprintk("%s pattern_offset %lld to large\n",
+                                __func__, fl->pattern_offset);
+                goto done;
+        }
+        if (fl->stripe_unit % PAGE_SIZE) {
+                dprintk("%s Stripe unit (%u) not page aligned\n",
+                        __func__, fl->stripe_unit);
+                goto done;
+        }
+        /* try the deviceid cache first, fall back to get_device_info() */
+        dev = nfs4_fl_find_get_deviceid(server->nfs_client, id);
+        if (dev == NULL)
+                dev = get_device_info(lo->inode, id);
+        if (dev == NULL)
+                goto done;
+        fl->dsaddr = dev;
+        if (fl->first_stripe_index < 0 ||
+            fl->first_stripe_index >= dev->stripe_count) {
+                dprintk("%s Bad first_stripe_index %d\n",
+                                __func__, fl->first_stripe_index);
+                goto drop_dev;
+        }
+        /* the permitted file handle count depends on the packing type */
+        switch (fl->stripe_type) {
+        case STRIPE_SPARSE:
+                bad_fh_count = fl->num_fh > 1 && fl->num_fh != dev->ds_num;
+                break;
+        case STRIPE_DENSE:
+                bad_fh_count = fl->num_fh != dev->stripe_count;
+                break;
+        default:
+                bad_fh_count = 0;
+                break;
+        }
+        if (bad_fh_count) {
+                dprintk("%s num_fh %u not valid for given packing\n",
+                        __func__, fl->num_fh);
+                goto drop_dev;
+        }
+        /* misalignment with rsize/wsize is only reported, not rejected */
+        if (fl->stripe_unit % server->rsize || fl->stripe_unit % server->wsize) {
+                dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+                        "wsize %u\n", __func__, fl->stripe_unit, server->rsize,
+                        server->wsize);
+        }
+        ret = 0;
+        goto done;
+drop_dev:
+        pnfs_put_deviceid(server->nfs_client->cl_devid_cache, &dev->deviceid);
+done:
+        dprintk("--> %s returns %d\n", __func__, ret);
+        return ret;
+}
+/*
+ * Free the file handles stored in fl->fh_array and then the array itself,
+ * leaving fl->fh_array NULL.
+ *
+ * The array is zero-allocated and filled front to back, so the first NULL
+ * slot marks the end of the populated entries.
+ *
+ * Safe to call when the array was never allocated or was already freed:
+ * filelayout_decode_layout() frees it on its own error paths and
+ * _filelayout_free_lseg() then calls this a second time while fl->num_fh
+ * may still be non-zero.
+ */
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+        int i;
+        /* nothing to walk; indexing a NULL array below would oops */
+        if (!fl->fh_array)
+                return;
+        for (i = 0; i < fl->num_fh; i++) {
+                if (!fl->fh_array[i])
+                        break;
+                kfree(fl->fh_array[i]);
+        }
+        kfree(fl->fh_array);
+        fl->fh_array = NULL;
+}
+/* Release a file layout segment: first its file handle array, then @fl. */
+static void
+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+        filelayout_free_fh_array(fl);
+        kfree(fl);
+}
+/*
+ * Decode the opaque file layout body in lgr->layout.buf into @fl and copy
+ * the device id it names into @id.
+ *
+ * Fields are read in this order: deviceid, nfl_util (flags + stripe unit),
+ * first stripe index, pattern offset, file handle count, then one
+ * length-prefixed file handle per entry.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -EIO when a file
+ * handle is larger than struct nfs_fh can hold. On failure the fh array is
+ * freed and fl->num_fh is reset to 0, so that the caller's own cleanup via
+ * _filelayout_free_lseg() does not walk a NULL array.
+ *
+ * NOTE(review): nothing here bounds the reads against the length of the
+ * received layout buffer - confirm the caller guarantees enough data.
+ */
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+                         struct nfs4_filelayout_segment *fl,
+                         struct nfs4_layoutget_res *lgr,
+                         struct nfs4_deviceid *id)
+{
+        __be32 *p = (__be32 *)lgr->layout.buf;
+        uint32_t nfl_util;
+        uint32_t fh_size;
+        int i, err;
+        dprintk("%s: set_layout_map Begin\n", __func__);
+        memcpy(id, p, sizeof(*id));
+        p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+        print_deviceid(id);
+        nfl_util = be32_to_cpup(p++);
+        if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+                fl->commit_through_mds = 1;
+        if (nfl_util & NFL4_UFLG_DENSE)
+                fl->stripe_type = STRIPE_DENSE;
+        else
+                fl->stripe_type = STRIPE_SPARSE;
+        fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+        fl->first_stripe_index = be32_to_cpup(p++);
+        p = xdr_decode_hyper(p, &fl->pattern_offset);
+        fl->num_fh = be32_to_cpup(p++);
+        dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+                __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+                fl->pattern_offset);
+        /* kcalloc() rejects a num_fh * sizeof() product that would overflow */
+        fl->fh_array = kcalloc(fl->num_fh, sizeof(struct nfs_fh *),
+                               GFP_KERNEL);
+        if (!fl->fh_array) {
+                err = -ENOMEM;
+                goto out_err;
+        }
+        for (i = 0; i < fl->num_fh; i++) {
+                /* Do we want to use a mempool here? */
+                fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+                if (!fl->fh_array[i]) {
+                        err = -ENOMEM;
+                        goto out_err_free;
+                }
+                /*
+                 * Validate the full 32-bit wire length before storing it in
+                 * the (narrower) size field, and compare against the data
+                 * buffer rather than the whole struct so the memcpy() below
+                 * cannot run past the end of the allocation.
+                 */
+                fh_size = be32_to_cpup(p++);
+                if (fh_size > sizeof(fl->fh_array[i]->data)) {
+                        printk(KERN_ERR "Too big fh %d received %u\n",
+                               i, fh_size);
+                        err = -EIO;
+                        goto out_err_free;
+                }
+                fl->fh_array[i]->size = fh_size;
+                memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+                p += XDR_QUADLEN(fl->fh_array[i]->size);
+                dprintk("DEBUG: %s: fh len %d\n", __func__,
+                        fl->fh_array[i]->size);
+        }
+        return 0;
+out_err_free:
+        filelayout_free_fh_array(fl);
+out_err:
+        /* no array left: keep num_fh consistent for later cleanup */
+        fl->num_fh = 0;
+        return err;
+}
+/*
+ * Allocate a file layout segment, decode the LAYOUTGET result into it and
+ * validate it. Returns the embedded generic segment header, or NULL when
+ * allocation, decoding or validation fails.
+ */
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+                      struct nfs4_layoutget_res *lgr)
+{
+        struct nfs4_deviceid devid;
+        struct nfs4_filelayout_segment *seg;
+        dprintk("--> %s\n", __func__);
+        seg = kzalloc(sizeof(*seg), GFP_KERNEL);
+        if (seg == NULL)
+                return NULL;
+        if (filelayout_decode_layout(layoutid, seg, lgr, &devid) != 0)
+                goto out_free;
+        if (filelayout_check_layout(layoutid, seg, lgr, &devid))
+                goto out_free;
+        return &seg->generic_hdr;
+out_free:
+        _filelayout_free_lseg(seg);
+        return NULL;
+}
+/*
+ * Tear down a layout segment: put the deviceid it refers to, then free the
+ * file-layout specific segment data.
+ */
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+        struct nfs4_filelayout_segment *seg = FILELAYOUT_LSEG(lseg);
+        struct nfs_client *clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+        dprintk("--> %s\n", __func__);
+        pnfs_put_deviceid(clp->cl_devid_cache, &seg->dsaddr->deviceid);
+        _filelayout_free_lseg(seg);
+}
+/* Operations table handed to pnfs_register_layoutdriver() at module init */
+static struct pnfs_layoutdriver_type filelayout_type = {
+        .id                     = LAYOUT_NFSV4_1_FILES,
+        .name                   = "LAYOUT_NFSV4_1_FILES",
+        .owner                  = THIS_MODULE,
+        .set_layoutdriver       = filelayout_set_layoutdriver,
+        .clear_layoutdriver     = filelayout_clear_layoutdriver,
+        .alloc_lseg             = filelayout_alloc_lseg,
+        .free_lseg              = filelayout_free_lseg,
+};
+/* Module entry point: log the registration and register the layout driver. */
+static int __init nfs4filelayout_init(void)
+{
+        int ret;
+        printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+               __func__);
+        ret = pnfs_register_layoutdriver(&filelayout_type);
+        return ret;
+}
+/* Module exit point: log the removal and unregister the layout driver. */
+static void __exit nfs4filelayout_exit(void)
+{
+        printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+               __func__);
+        pnfs_unregister_layoutdriver(&filelayout_type);
+}
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644
index 000000000000..bbf60dd2ab9d
--- /dev/null
+++ b/fs/nfs/nfs4filelayout.h
@@ -0,0 +1,94 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+#include "pnfs.h"
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+enum stripetype4 {
+        STRIPE_SPARSE = 1,      /* chosen when NFL4_UFLG_DENSE is clear in nfl_util */
+        STRIPE_DENSE = 2        /* chosen when NFL4_UFLG_DENSE is set in nfl_util */
+};
+/* Individual ip address: one data server, keyed by address and port */
+struct nfs4_pnfs_ds {
+        struct list_head        ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+        u32                     ds_ip_addr;     /* printed via ntohl(); presumably network byte order */
+        u32                     ds_port;        /* filled from an htons() value in decode_and_add_ds() */
+        struct nfs_client       *ds_clp;        /* may be NULL; put with nfs_put_client() in destroy_ds() */
+        atomic_t                ds_count;       /* reference count, set to 1 on allocation */
+};
+struct nfs4_file_layout_dsaddr {
+        struct pnfs_deviceid_node       deviceid;       /* embedded node; container_of() recovers this struct */
+        u32                             stripe_count;   /* number of entries in stripe_indices */
+        u8                              *stripe_indices;        /* separately allocated, kfree()d with the dsaddr */
+        u32                             ds_num;         /* number of entries in ds_list */
+        struct nfs4_pnfs_ds             *ds_list[1];    /* allocated with room for ds_num pointers */
+};
+struct nfs4_filelayout_segment {
+        struct pnfs_layout_segment generic_hdr; /* embedded header, see FILELAYOUT_LSEG() */
+        u32 stripe_type;        /* STRIPE_SPARSE or STRIPE_DENSE */
+        u32 commit_through_mds; /* set to 1 when NFL4_UFLG_COMMIT_THRU_MDS is present */
+        u32 stripe_unit;        /* nfl_util with the NFL4_UFLG_MASK bits cleared */
+        u32 first_stripe_index;
+        u64 pattern_offset;
+        struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+        unsigned int num_fh;    /* number of entries in fh_array */
+        struct nfs_fh **fh_array;       /* array of individually allocated file handles */
+};
+/* Map a generic layout segment back to the file layout segment embedding it */
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+        return container_of(lseg, struct nfs4_filelayout_segment, generic_hdr);
+}
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644
index 000000000000..51fe64ace55a
--- /dev/null
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -0,0 +1,448 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+#include "nfs4filelayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+/* Debug routines */
+/*
+ * Dump one data server entry (address, port, reference count, client
+ * pointer and exchange flags) with printk(). A NULL @ds is reported and
+ * otherwise ignored.
+ */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+        if (!ds) {
+                printk("%s NULL device\n", __func__);
+                return;
+        }
+        printk("        ip_addr %x port %hu\n"
+                "        ref count %d\n"
+                "        client %p\n"
+                "        cl_exchange_flags %x\n",
+                ntohl(ds->ds_ip_addr),
+                ntohs(ds->ds_port),
+                atomic_read(&ds->ds_count),
+                ds->ds_clp,
+                ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+/* When the ifdebug(FACILITY) check passes, print every data server of @dsaddr */
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+        int idx;
+        ifdebug(FACILITY) {
+                printk("%s dsaddr->ds_num %d\n", __func__,
+                       dsaddr->ds_num);
+                idx = 0;
+                while (idx < dsaddr->ds_num) {
+                        print_ds(dsaddr->ds_list[idx]);
+                        idx++;
+                }
+        }
+}
+/* Debug-print the device id, read as four consecutive 32-bit words */
+void print_deviceid(struct nfs4_deviceid *id)
+{
+        u32 *words = (u32 *)id;
+        dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+                words[0], words[1], words[2], words[3]);
+}
+/*
+ * Find the cached data server matching @ip_addr and @port, or return NULL.
+ * nfs4_ds_cache_lock is held by the caller.
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+        struct nfs4_pnfs_ds *cur;
+        dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+                        ntohl(ip_addr), ntohs(port));
+        list_for_each_entry(cur, &nfs4_data_server_cache, ds_node) {
+                if (cur->ds_ip_addr != ip_addr)
+                        continue;
+                if (cur->ds_port != port)
+                        continue;
+                return cur;
+        }
+        return NULL;
+}
+/* Free a data server entry, putting its nfs_client first if one is attached */
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+        dprintk("--> %s\n", __func__);
+        ifdebug(FACILITY)
+                print_ds(ds);
+        if (ds->ds_clp != NULL)
+                nfs_put_client(ds->ds_clp);
+        kfree(ds);
+}
+/*
+ * Free a device address: drop one reference on each data server it lists,
+ * destroying those whose count reaches zero, then free the stripe index
+ * table and the dsaddr itself.
+ */
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+        int idx;
+        print_deviceid(&dsaddr->deviceid.de_id);
+        for (idx = 0; idx < dsaddr->ds_num; idx++) {
+                struct nfs4_pnfs_ds *ds = dsaddr->ds_list[idx];
+                if (ds == NULL)
+                        continue;
+                /* the cache lock is only taken when the count hits zero */
+                if (!atomic_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock))
+                        continue;
+                list_del_init(&ds->ds_node);
+                spin_unlock(&nfs4_ds_cache_lock);
+                destroy_ds(ds);
+        }
+        kfree(dsaddr->stripe_indices);
+        kfree(dsaddr);
+}
+/*
+ * Free callback for the deviceid cache: recover the enclosing dsaddr from
+ * the generic deviceid node and free it.
+ */
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+        nfs4_fl_free_deviceid(container_of(device,
+                                           struct nfs4_file_layout_dsaddr,
+                                           deviceid));
+}
+/*
+ * Return the data server entry for @ip_addr/@port. An existing cache entry
+ * gets its reference count incremented; otherwise a new entry with count 1
+ * is added to the cache. Returns NULL if the allocation fails.
+ *
+ * The new entry is allocated before the spinlock is taken; it is freed
+ * again if the lookup finds an existing entry.
+ */
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+        struct nfs4_pnfs_ds *found, *ds;
+        ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+        if (ds == NULL)
+                return NULL;
+        spin_lock(&nfs4_ds_cache_lock);
+        found = _data_server_lookup_locked(ip_addr, port);
+        if (found != NULL) {
+                kfree(ds);
+                atomic_inc(&found->ds_count);
+                dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+                        __func__, found->ds_ip_addr,
+                        atomic_read(&found->ds_count));
+                ds = found;
+        } else {
+                ds->ds_ip_addr = ip_addr;
+                ds->ds_port = port;
+                atomic_set(&ds->ds_count, 1);
+                INIT_LIST_HEAD(&ds->ds_node);
+                ds->ds_clp = NULL;
+                list_add(&ds->ds_node, &nfs4_data_server_cache);
+                dprintk("%s add new data server ip 0x%x\n", __func__,
+                        ds->ds_ip_addr);
+        }
+        spin_unlock(&nfs4_ds_cache_lock);
+        return ds;
+}
+/*
+ * Decode one (r_netid, r_addr) pair from the XDR data at *pp and return
+ * the data server entry for it, obtained through nfs4_pnfs_ds_add().
+ *
+ * *pp is advanced past both strings before any validation, so it moves
+ * forward even when NULL is returned. NULL is returned when the netid is
+ * not "tcp", the address string is malformed or not ipv4, or memory
+ * allocation fails.
+ *
+ * The address is expected in the form "a.b.c.d.p1.p2", where p1 and p2
+ * are the high and low bytes of the port.
+ *
+ * Currently only support ipv4, and one multi-path address.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+        struct nfs4_pnfs_ds *ds = NULL;
+        char *buf;
+        const char *ipend, *pstr;
+        u32 ip_addr, port;
+        int nlen, rlen, i;
+        int tmp[2];
+        __be32 *r_netid, *r_addr, *p = *pp;
+        /* r_netid */
+        nlen = be32_to_cpup(p++);
+        r_netid = p;
+        p += XDR_QUADLEN(nlen);
+        /* r_addr */
+        rlen = be32_to_cpup(p++);
+        r_addr = p;
+        p += XDR_QUADLEN(rlen);
+        *pp = p;
+        /* Check that netid is "tcp" */
+        if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
+                dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+                goto out_err;
+        }
+        /*
+         * ipv6 length plus port is legal. A wire length above INT_MAX shows
+         * up as a negative rlen and must be rejected too, otherwise
+         * rlen == -1 would give a zero-size allocation and a write to
+         * buf[-1] below.
+         */
+        if (rlen < 0 || rlen > INET6_ADDRSTRLEN + 8) {
+                dprintk("%s Invalid address, length %d\n", __func__,
+                        rlen);
+                goto out_err;
+        }
+        buf = kmalloc(rlen + 1, GFP_KERNEL);
+        if (!buf) {
+                dprintk("%s: Not enough memory\n", __func__);
+                goto out_err;
+        }
+        buf[rlen] = '\0';
+        memcpy(buf, r_addr, rlen);
+        /* replace the port dots with dashes for the in4_pton() delimiter */
+        for (i = 0; i < 2; i++) {
+                char *res = strrchr(buf, '.');
+                if (!res) {
+                        dprintk("%s: Failed finding expected dots in port\n",
+                                __func__);
+                        goto out_free;
+                }
+                *res = '-';
+        }
+        /* Currently only support ipv4 address */
+        if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+                dprintk("%s: Only ipv4 addresses supported\n", __func__);
+                goto out_free;
+        }
+        /* port: both bytes must be present and fit in 0..255 */
+        pstr = ipend;
+        if (sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]) != 2 ||
+            tmp[0] < 0 || tmp[0] > 255 || tmp[1] < 0 || tmp[1] > 255) {
+                dprintk("%s: Invalid port in address %s\n", __func__, buf);
+                goto out_free;
+        }
+        port = htons((tmp[0] << 8) | (tmp[1]));
+        ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+        dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+        kfree(buf);
+out_err:
+        return ds;
+}
+/*
+ * Decode the opaque device data in pdev->area and return the result.
+ *
+ * The buffer is read as big-endian 32-bit words in this order: stripe
+ * count, one index per stripe, multipath list count, then one multipath
+ * list per data server. Only the first entry of each multipath list is
+ * decoded (through decode_and_add_ds()); any further entries are skipped.
+ *
+ * Returns a newly allocated dsaddr, or NULL if a count exceeds its
+ * supported maximum, the multipath list is empty, a stripe index is out of
+ * range, an allocation fails or a data server cannot be decoded.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+        int i, dummy;
+        u32 cnt, num;
+        u8 *indexp;
+        __be32 *p = (__be32 *)pdev->area, *indicesp;
+        struct nfs4_file_layout_dsaddr *dsaddr;
+        /* Get the stripe count (number of stripe index) */
+        cnt = be32_to_cpup(p++);
+        dprintk("%s stripe count  %d\n", __func__, cnt);
+        if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+                printk(KERN_WARNING "%s: stripe count %d greater than "
+                       "supported maximum %d\n", __func__,
+                        cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+                goto out_err;
+        }
+        /* Check the multipath list count */
+        indicesp = p;
+        p += XDR_QUADLEN(cnt << 2);
+        num = be32_to_cpup(p++);
+        dprintk("%s ds_num %u\n", __func__, num);
+        if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+                printk(KERN_WARNING "%s: multipath count %d greater than "
+                        "supported maximum %d\n", __func__,
+                        num, NFS4_PNFS_MAX_MULTI_CNT);
+                goto out_err;
+        }
+        /*
+         * The allocation below is sized with (num - 1). num is unsigned, so
+         * an empty list would wrap around instead of shrinking the size.
+         */
+        if (num == 0) {
+                printk(KERN_WARNING "%s: empty multipath list\n", __func__);
+                goto out_err;
+        }
+        dsaddr = kzalloc(sizeof(*dsaddr) +
+                        (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+                        GFP_KERNEL);
+        if (!dsaddr)
+                goto out_err;
+        dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+        if (!dsaddr->stripe_indices)
+                goto out_err_free;
+        dsaddr->stripe_count = cnt;
+        dsaddr->ds_num = num;
+        memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+        /* Go back and read stripe indices */
+        p = indicesp;
+        indexp = &dsaddr->stripe_indices[0];
+        for (i = 0; i < dsaddr->stripe_count; i++) {
+                u32 idx = be32_to_cpup(p++);
+                /*
+                 * Range-check the full 32-bit value before narrowing it to
+                 * u8; checking after the store would let values such as 256
+                 * wrap to a valid-looking index.
+                 * NOTE(review): assumes NFS4_PNFS_MAX_MULTI_CNT <= 256 so
+                 * every accepted index fits in u8 - confirm.
+                 */
+                if (idx >= num)
+                        goto out_err_free;
+                *indexp++ = idx;
+        }
+        /* Skip already read multipath list count */
+        p++;
+        for (i = 0; i < dsaddr->ds_num; i++) {
+                int j;
+                dummy = be32_to_cpup(p++); /* multipath count */
+                if (dummy > 1) {
+                        printk(KERN_WARNING
+                               "%s: Multipath count %d not supported, "
+                               "skipping all greater than 1\n", __func__,
+                                dummy);
+                }
+                /*
+                 * NOTE(review): a multipath count of zero leaves
+                 * ds_list[i] NULL - confirm that users of ds_list cope.
+                 */
+                for (j = 0; j < dummy; j++) {
+                        if (j == 0) {
+                                dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
+                                if (dsaddr->ds_list[i] == NULL)
+                                        goto out_err_free;
+                        } else {
+                                u32 len;
+                                /*
+                                 * skip extra multipath: two length-prefixed
+                                 * opaque fields per entry
+                                 */
+                                len = be32_to_cpup(p++);
+                                p += XDR_QUADLEN(len);
+                                len = be32_to_cpup(p++);
+                                p += XDR_QUADLEN(len);
+                        }
+                }
+        }
+        return dsaddr;
+out_err_free:
+        nfs4_fl_free_deviceid(dsaddr);
+out_err:
+        dprintk("%s ERROR: returning NULL\n", __func__);
+        return NULL;
+}
+/*
+ * Decode the opaque device described by 'dev' and hand the result to
+ * pnfs_add_deviceid() for insertion into the client's device ID cache.
+ * If the deviceid is already cached, pnfs_add_deviceid() is expected to
+ * return the cached node and throw away the new one.
+ *
+ * Returns the dsaddr that embeds the node returned by pnfs_add_deviceid(),
+ * or NULL if the device could not be decoded.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
+        struct pnfs_deviceid_node *node;
+        struct nfs4_file_layout_dsaddr *decoded = decode_device(inode, dev);
+        if (decoded == NULL) {
+                printk(KERN_WARNING "%s: Could not decode or add device\n",
+                        __func__);
+                return NULL;
+        }
+        node = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+                                 &decoded->deviceid);
+        return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
+/*
+ * Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it.
+ *
+ * The reply buffer is sized from the session's fc_attrs.max_resp_sz,
+ * rounded down to whole pages, and mapped with vmap() so that the decoder
+ * can read it through pdev->area.
+ *
+ * Returns the dsaddr produced by decode_and_add_device(), or NULL if an
+ * allocation, the mapping, the GETDEVICEINFO call or the decode fails.
+ * The temporary pages, the page array and the pnfs_device are released
+ * before returning.
+ */
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
+        struct pnfs_device *pdev = NULL;
+        u32 max_resp_sz;
+        int max_pages;
+        struct page **pages = NULL;
+        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+        int rc, i;
+        struct nfs_server *server = NFS_SERVER(inode);
+        /*
+         * Use the session max response size as the basis for setting
+         * GETDEVICEINFO's maxcount
+         */
+        max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+        max_pages = max_resp_sz >> PAGE_SHIFT;
+        dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+                __func__, inode, max_resp_sz, max_pages);
+        pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
+        if (pdev == NULL)
+                return NULL;
+        pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+        if (pages == NULL) {
+                kfree(pdev);
+                return NULL;
+        }
+        for (i = 0; i < max_pages; i++) {
+                pages[i] = alloc_page(GFP_KERNEL);
+                if (!pages[i])
+                        goto out_free;
+        }
+        /* set pdev->area */
+        pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+        if (!pdev->area)
+                goto out_free;
+        memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+        pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+        pdev->pages = pages;
+        pdev->pgbase = 0;
+        pdev->pglen = PAGE_SIZE * max_pages;
+        pdev->mincount = 0;
+        rc = nfs4_proc_getdeviceinfo(server, pdev);
+        dprintk("%s getdevice info returns %d\n", __func__, rc);
+        if (rc)
+                goto out_free;
+        /*
+         * Found new device, need to decode it and then add it to the
+         * list of known devices for this mountpoint.
+         */
+        dsaddr = decode_and_add_device(inode, pdev);
+out_free:
+        /* pdev was zero-allocated, so area is NULL unless vmap() succeeded */
+        if (pdev->area != NULL)
+                vunmap(pdev->area);
+        /*
+         * When alloc_page() failed part way through, the remaining slots of
+         * the zero-allocated array are still NULL and must not be freed.
+         */
+        for (i = 0; i < max_pages; i++) {
+                if (pages[i] != NULL)
+                        __free_page(pages[i]);
+        }
+        kfree(pages);
+        kfree(pdev);
+        dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+        return dsaddr;
+}
+/*
+ * Look up 'id' in the client's device ID cache through
+ * pnfs_find_get_deviceid() and return the file layout dsaddr that embeds
+ * the node found, or NULL if the lookup returns nothing.
+ */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+{
+        struct pnfs_deviceid_node *node;
+        node = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
+        if (node == NULL)
+                return NULL;
+        return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 089da5b5d20a..0f24cdf2cb13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,6 +55,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "callback.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
@@ -129,7 +130,8 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD0_MAXREAD
                        | FATTR4_WORD0_MAXWRITE
                        | FATTR4_WORD0_LEASE_TIME,
-                        0
+                        FATTR4_WORD1_TIME_DELTA
+                        | FATTR4_WORD1_FS_LAYOUT_TYPES
 };
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -255,9 +257,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
                        nfs4_state_mark_reclaim_nograce(clp, state);
                        goto do_state_recovery;
                case -NFS4ERR_STALE_STATEID:
-                        if (state == NULL)
-                                break;
-                        nfs4_state_mark_reclaim_reboot(clp, state);
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
                        goto do_state_recovery;
@@ -334,10 +333,12 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 * Must be called while holding tbl->slot_tbl_lock
 */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
 {
+        int free_slotid = free_slot - tbl->slots;
        int slotid = free_slotid;
+        BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
        /* clear used bit in bitmap */
        __clear_bit(slotid, tbl->used_slots);
@@ -379,7 +380,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        struct nfs4_slot_table *tbl;
        tbl = &res->sr_session->fc_slot_table;
-        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
+        if (!res->sr_slot) {
                /* just wake up the next guy waiting since
                 * we may have not consumed a slot after all */
                dprintk("%s: No slot\n", __func__);
@@ -387,17 +388,15 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
        }
        spin_lock(&tbl->slot_tbl_lock);
-        nfs4_free_slot(tbl, res->sr_slotid);
+        nfs4_free_slot(tbl, res->sr_slot);
        nfs41_check_drain_session_complete(res->sr_session);
        spin_unlock(&tbl->slot_tbl_lock);
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        res->sr_slot = NULL;
 }
 static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
        unsigned long timestamp;
-        struct nfs4_slot_table *tbl;
-        struct nfs4_slot *slot;
        struct nfs_client *clp;
        /*
@@ -410,17 +409,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
                res->sr_status = NFS_OK;
        /* -ERESTARTSYS can result in skipping nfs41_sequence_setup */
-        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
+        if (!res->sr_slot)
                goto out;
-        tbl = &res->sr_session->fc_slot_table;
-        slot = tbl->slots + res->sr_slotid;
        /* Check the SEQUENCE operation status */
        switch (res->sr_status) {
        case 0:
                /* Update the slot's sequence and clientid lease timer */
-                ++slot->seq_nr;
+                ++res->sr_slot->seq_nr;
                timestamp = res->sr_renewal_time;
                clp = res->sr_session->clp;
                do_renew_lease(clp, timestamp);
@@ -433,12 +429,14 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
                 * returned NFS4ERR_DELAY as per Section 2.10.6.2
                 * of RFC5661.
                 */
-                dprintk("%s: slot=%d seq=%d: Operation in progress\n",
+                dprintk("%s: slot=%td seq=%d: Operation in progress\n",
-                                __func__, res->sr_slotid, slot->seq_nr);
+                        __func__,
+                        res->sr_slot - res->sr_session->fc_slot_table.slots,
+                        res->sr_slot->seq_nr);
                goto out_retry;
        default:
                /* Just update the slot sequence no. */
-                ++slot->seq_nr;
+                ++res->sr_slot->seq_nr;
        }
 out:
        /* The session may be reset by one of the error handlers. */
@@ -505,10 +503,9 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        dprintk("--> %s\n", __func__);
        /* slot already allocated? */
-        if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
+        if (res->sr_slot != NULL)
                return 0;
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
        tbl = &session->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
@@ -550,7 +547,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
        res->sr_session = session;
-        res->sr_slotid = slotid;
+        res->sr_slot = slot;
        res->sr_renewal_time = jiffies;
        res->sr_status_flags = 0;
        /*
@@ -576,8 +573,9 @@ int nfs4_setup_sequence(const struct nfs_server *server,
                goto out;
        }
-        dprintk("--> %s clp %p session %p sr_slotid %d\n",
+        dprintk("--> %s clp %p session %p sr_slot %td\n",
-                __func__, session->clp, session, res->sr_slotid);
+                __func__, session->clp, session, res->sr_slot ?
+                        res->sr_slot - session->fc_slot_table.slots : -1);
        ret = nfs41_setup_sequence(session, args, res, cache_reply,
                                   task);
@@ -650,7 +648,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
                .callback_data = &data
        };
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        res->sr_slot = NULL;
        if (privileged)
                task_setup.callback_ops = &nfs41_call_priv_sync_ops;
        task = rpc_run_task(&task_setup);
@@ -735,7 +733,6 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
        p->o_res.server = p->o_arg.server;
        nfs_fattr_init(&p->f_attr);
        nfs_fattr_init(&p->dir_attr);
-        p->o_res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
@@ -1120,6 +1117,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
        clear_bit(NFS_DELEGATED_STATE, &state->flags);
        smp_rmb();
        if (state->n_rdwr != 0) {
+                clear_bit(NFS_O_RDWR_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
                if (ret != 0)
                        return ret;
@@ -1127,6 +1125,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
                        return -ESTALE;
        }
        if (state->n_wronly != 0) {
+                clear_bit(NFS_O_WRONLY_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
                if (ret != 0)
                        return ret;
@@ -1134,6 +1133,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
                        return -ESTALE;
        }
        if (state->n_rdonly != 0) {
+                clear_bit(NFS_O_RDONLY_STATE, &state->flags);
                ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
                if (ret != 0)
                        return ret;
@@ -1188,7 +1188,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
        int err;
        do {
                err = _nfs4_do_open_reclaim(ctx, state);
-                if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
@@ -1258,6 +1258,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_BAD_STATEID:
                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
                        case -ENOMEM:
                                err = 0;
                                goto out;
@@ -1605,7 +1612,6 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
                        goto out;
                case -NFS4ERR_GRACE:
                case -NFS4ERR_DELAY:
-                case -EKEYEXPIRED:
                        nfs4_handle_exception(server, err, &exception);
                        err = 0;
                }
@@ -1975,7 +1981,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
-        calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        path_get(path);
        calldata->path = *path;
@@ -1998,120 +2003,17 @@ out:
        return status;
 }
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
-        struct file *filp;
-        int ret;
-        /* If the open_intent is for execute, we have an extra check to make */
-        if (fmode & FMODE_EXEC) {
-                ret = nfs_may_open(state->inode,
-                                state->owner->so_cred,
-                                nd->intent.open.flags);
-                if (ret < 0)
-                        goto out_close;
-        }
-        filp = lookup_instantiate_filp(nd, path->dentry, NULL);
-        if (!IS_ERR(filp)) {
-                struct nfs_open_context *ctx;
-                ctx = nfs_file_open_context(filp);
-                ctx->state = state;
-                return 0;
-        }
-        ret = PTR_ERR(filp);
-out_close:
-        nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
-        return ret;
-}
-struct dentry *
-nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct path path = {
-                .mnt = nd->path.mnt,
-                .dentry = dentry,
-        };
-        struct dentry *parent;
-        struct iattr attr;
-        struct rpc_cred *cred;
        struct nfs4_state *state;
-        struct dentry *res;
-        int open_flags = nd->intent.open.flags;
-        fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
-        if (nd->flags & LOOKUP_CREATE) {
-                attr.ia_mode = nd->intent.open.create_mode;
-                attr.ia_valid = ATTR_MODE;
-                if (!IS_POSIXACL(dir))
-                        attr.ia_mode &= ~current_umask();
-        } else {
-                open_flags &= ~O_EXCL;
-                attr.ia_valid = 0;
-                BUG_ON(open_flags & O_CREAT);
-        }
-        cred = rpc_lookup_cred();
-        if (IS_ERR(cred))
-                return (struct dentry *)cred;
-        parent = dentry->d_parent;
        /* Protect against concurrent sillydeletes */
-        nfs_block_sillyrename(parent);
+        state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred);
-        state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
+        if (IS_ERR(state))
-        put_rpccred(cred);
+                return ERR_CAST(state);
-        if (IS_ERR(state)) {
+        ctx->state = state;
-                if (PTR_ERR(state) == -ENOENT) {
+        return igrab(state->inode);
-                        d_add(dentry, NULL);
-                        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                }
-                nfs_unblock_sillyrename(parent);
-                return (struct dentry *)state;
-        }
-        res = d_add_unique(dentry, igrab(state->inode));
-        if (res != NULL)
-                path.dentry = res;
-        nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
-        nfs_unblock_sillyrename(parent);
-        nfs4_intent_set_file(nd, &path, state, fmode);
-        return res;
-}
-int
-nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
-{
-        struct path path = {
-                .mnt = nd->path.mnt,
-                .dentry = dentry,
-        };
-        struct rpc_cred *cred;
-        struct nfs4_state *state;
-        fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
-        cred = rpc_lookup_cred();
-        if (IS_ERR(cred))
-                return PTR_ERR(cred);
-        state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
-        put_rpccred(cred);
-        if (IS_ERR(state)) {
-                switch (PTR_ERR(state)) {
-                        case -EPERM:
-                        case -EACCES:
-                        case -EDQUOT:
-                        case -ENOSPC:
-                        case -EROFS:
-                                return PTR_ERR(state);
-                        default:
-                                goto out_drop;
-                }
-        }
-        if (state->inode == dentry->d_inode) {
-                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-                nfs4_intent_set_file(nd, &path, state, fmode);
-                return 1;
-        }
-        nfs4_close_sync(&path, state, fmode);
-out_drop:
-        d_drop(dentry);
-        return 0;
 }
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2568,36 +2470,34 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
 static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                 int flags, struct nameidata *nd)
+                 int flags, struct nfs_open_context *ctx)
 {
-        struct path path = {
+        struct path my_path = {
-                .mnt = nd->path.mnt,
                .dentry = dentry,
        };
+        struct path *path = &my_path;
        struct nfs4_state *state;
-        struct rpc_cred *cred;
+        struct rpc_cred *cred = NULL;
-        fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
+        fmode_t fmode = 0;
        int status = 0;
-        cred = rpc_lookup_cred();
+        if (ctx != NULL) {
-        if (IS_ERR(cred)) {
+                cred = ctx->cred;
-                status = PTR_ERR(cred);
+                path = &ctx->path;
-                goto out;
+                fmode = ctx->mode;
        }
-        state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
+        state = nfs4_do_open(dir, path, fmode, flags, sattr, cred);
        d_drop(dentry);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
-                goto out_putcred;
+                goto out;
        }
        d_add(dentry, igrab(state->inode));
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-        if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
+        if (ctx != NULL)
-                status = nfs4_intent_set_file(nd, &path, state, fmode);
+                ctx->state = state;
        else
-                nfs4_close_sync(&path, state, fmode);
+                nfs4_close_sync(path, state, fmode);
-out_putcred:
-        put_rpccred(cred);
 out:
        return status;
 }
@@ -2655,6 +2555,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
        args->bitmask = server->cache_consistency_bitmask;
        res->server = server;
+        res->seq_res.sr_slot = NULL;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2671,18 +2572,46 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+/*
+ * Prepare a RENAME rpc_message: select the NFSv4 RENAME procedure and fill
+ * in the fields of its argument and result that come from the server that
+ * owns 'dir'.
+ */
+static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+        struct nfs_renameres *rename_res = msg->rpc_resp;
+        struct nfs_renameargs *rename_args = msg->rpc_argp;
+        struct nfs_server *srv = NFS_SERVER(dir);
+        rename_res->server = srv;
+        rename_args->bitmask = srv->attr_bitmask;
+        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+}
+/*
+ * Completion handler for a RENAME task.
+ *
+ * Returns 0 without touching either directory when nfs4_sequence_done()
+ * returns zero or when nfs4_async_handle_error() returns -EAGAIN.
+ * Otherwise applies the change info and post-op attributes from the reply
+ * to both the old and the new directory and returns 1.
+ * NOTE(review): a 0 return presumably tells the caller the task is being
+ * retried - confirm against the caller.
+ */
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                                 struct inode *new_dir)
+{
+        struct nfs_renameres *res = task->tk_msg.rpc_resp;
+        /* the sequence result must be processed before the reply is used */
+        if (!nfs4_sequence_done(task, &res->seq_res))
+                return 0;
+        /* no nfs4_state is associated with a rename, hence the NULL */
+        if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+                return 0;
+        update_changeattr(old_dir, &res->old_cinfo);
+        nfs_post_op_update_inode(old_dir, res->old_fattr);
+        update_changeattr(new_dir, &res->new_cinfo);
+        nfs_post_op_update_inode(new_dir, res->new_fattr);
+        return 1;
+}
 static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
                struct inode *new_dir, struct qstr *new_name)
 {
        struct nfs_server *server = NFS_SERVER(old_dir);
-        struct nfs4_rename_arg arg = {
+        struct nfs_renameargs arg = {
                .old_dir = NFS_FH(old_dir),
                .new_dir = NFS_FH(new_dir),
                .old_name = old_name,
                .new_name = new_name,
                .bitmask = server->attr_bitmask,
        };
-        struct nfs4_rename_res res = {
+        struct nfs_renameres res = {
                .server = server,
        };
        struct rpc_message msg = {
@@ -2896,15 +2825,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 }
 static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        struct nfs4_readdir_arg args = {
                .fh = NFS_FH(dir),
-                .pages = &page,
+                .pages = pages,
                .pgbase = 0,
                .count = count,
                .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+                .plus = plus,
        };
        struct nfs4_readdir_res res;
        struct rpc_message msg = {
@@ -2932,14 +2862,14 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                  u64 cookie, struct page *page, unsigned int count, int plus)
+                u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct nfs4_exception exception = { };
        int err;
        do {
                err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
                                _nfs4_proc_readdir(dentry, cred, cookie,
-                                        page, count, plus),
+                                        pages, count, plus),
                                &exception);
        } while (exception.retry);
        return err;
@@ -3490,9 +3420,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                        nfs4_state_mark_reclaim_nograce(clp, state);
                        goto do_state_recovery;
                case -NFS4ERR_STALE_STATEID:
-                        if (state == NULL)
-                                break;
-                        nfs4_state_mark_reclaim_reboot(clp, state);
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_EXPIRED:
                        goto do_state_recovery;
@@ -3626,7 +3553,6 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
                        case -NFS4ERR_RESOURCE:
                                /* The IBM lawyers misread another document! */
                        case -NFS4ERR_DELAY:
-                        case -EKEYEXPIRED:
                                err = nfs4_delay(clp->cl_rpcclient, &timeout);
                }
        } while (err == 0);
@@ -3721,7 +3647,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        memcpy(&data->stateid, stateid, sizeof(data->stateid));
        data->res.fattr = &data->fattr;
        data->res.server = server;
-        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
@@ -3874,7 +3799,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
        p->arg.fl = &p->fl;
        p->arg.seqid = seqid;
        p->res.seqid = seqid;
-        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->arg.stateid = &lsp->ls_stateid;
        p->lsp = lsp;
        atomic_inc(&lsp->ls_count);
@@ -4054,7 +3978,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
        p->res.lock_seqid = p->arg.lock_seqid;
-        p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        p->lsp = lsp;
        p->server = server;
        atomic_inc(&lsp->ls_count);
@@ -4241,7 +4164,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-                if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
+                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
@@ -4266,7 +4189,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
                        goto out;
                case -NFS4ERR_GRACE:
                case -NFS4ERR_DELAY:
-                case -EKEYEXPIRED:
                        nfs4_handle_exception(server, err, &exception);
                        err = 0;
                }
@@ -4412,13 +4334,21 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                                nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
                                err = 0;
                                goto out;
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
+                                err = 0;
+                                goto out;
                        case -ENOMEM:
                        case -NFS4ERR_DENIED:
                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
                                err = 0;
                                goto out;
                        case -NFS4ERR_DELAY:
-                        case -EKEYEXPIRED:
                                break;
                }
                err = nfs4_handle_exception(server, err, &exception);
@@ -4647,7 +4577,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case -NFS4ERR_DELAY:
        case -NFS4ERR_GRACE:
-        case -EKEYEXPIRED:
                dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
                rpc_delay(task, NFS4_POLL_RETRY_MIN);
                task->tk_status = 0;
@@ -4687,7 +4616,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
        };
        int status;
-        res.lr_seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        dprintk("--> %s\n", __func__);
        task = rpc_run_task(&task_setup);
@@ -4914,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->bc_attrs.max_reqs);
 }
-static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
 {
-        if (rcvd <= sent)
+        struct nfs4_channel_attrs *sent = &args->fc_attrs;
-                return 0;
+        struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
-        printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
-                "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
+        if (rcvd->headerpadsz > sent->headerpadsz)
-        return -EINVAL;
+                return -EINVAL;
+        if (rcvd->max_resp_sz > sent->max_resp_sz)
+                return -EINVAL;
+        /*
+         * Our requested max_ops is the minimum we need; we're not
+         * prepared to break up compounds into smaller pieces than that.
+         * So, no point even trying to continue if the server won't
+         * cooperate:
+         */
+        if (rcvd->max_ops < sent->max_ops)
+                return -EINVAL;
+        if (rcvd->max_reqs == 0)
+                return -EINVAL;
+        return 0;
 }
-#define _verify_fore_channel_attr(_name_) \
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
-        _verify_channel_attr("fore", #_name_, \
+{
-                             args->fc_attrs._name_, \
+        struct nfs4_channel_attrs *sent = &args->bc_attrs;
-                             session->fc_attrs._name_)
+        struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
-#define _verify_back_channel_attr(_name_) \
+        if (rcvd->max_rqst_sz > sent->max_rqst_sz)
-        _verify_channel_attr("back", #_name_, \
+                return -EINVAL;
-                             args->bc_attrs._name_, \
+        if (rcvd->max_resp_sz < sent->max_resp_sz)
-                             session->bc_attrs._name_)
+                return -EINVAL;
+        if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+                return -EINVAL;
+        /* These would render the backchannel useless: */
+        if (rcvd->max_ops  == 0)
+                return -EINVAL;
+        if (rcvd->max_reqs == 0)
+                return -EINVAL;
+        return 0;
+}
-/*
- * The server is not allowed to increase the fore channel header pad size,
- * maximum response size, or maximum number of operations.
- *
- * The back channel attributes are only negotiatied down: We send what the
- * (back channel) server insists upon.
- */
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
                                     struct nfs4_session *session)
 {
-        int ret = 0;
+        int ret;
-        ret |= _verify_fore_channel_attr(headerpadsz);
-        ret |= _verify_fore_channel_attr(max_resp_sz);
-        ret |= _verify_fore_channel_attr(max_ops);
-        ret |= _verify_back_channel_attr(headerpadsz);
-        ret |= _verify_back_channel_attr(max_rqst_sz);
-        ret |= _verify_back_channel_attr(max_resp_sz);
-        ret |= _verify_back_channel_attr(max_resp_sz_cached);
-        ret |= _verify_back_channel_attr(max_ops);
-        ret |= _verify_back_channel_attr(max_reqs);
-        return ret;
+        ret = nfs4_verify_fore_channel_attrs(args, session);
+        if (ret)
+                return ret;
+        return nfs4_verify_back_channel_attrs(args, session);
 }
 static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5111,7 +5046,6 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
 {
        switch(task->tk_status) {
        case -NFS4ERR_DELAY:
-        case -EKEYEXPIRED:
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
@@ -5180,12 +5114,11 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
        if (!atomic_inc_not_zero(&clp->cl_count))
                return ERR_PTR(-EIO);
-        calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+        calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
        if (calldata == NULL) {
                nfs_put_client(clp);
                return ERR_PTR(-ENOMEM);
        }
-        calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        msg.rpc_argp = &calldata->args;
        msg.rpc_resp = &calldata->res;
        calldata->clp = clp;
@@ -5254,7 +5187,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
        case -NFS4ERR_WRONG_CRED: /* What to do here? */
                break;
        case -NFS4ERR_DELAY:
-        case -EKEYEXPIRED:
                rpc_delay(task, NFS4_POLL_RETRY_MAX);
                return -EAGAIN;
        default:
@@ -5317,7 +5249,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
                goto out;
        calldata->clp = clp;
        calldata->arg.one_fs = 0;
-        calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        msg.rpc_argp = &calldata->arg;
        msg.rpc_resp = &calldata->res;
@@ -5333,6 +5264,147 @@ out:
        dprintk("<-- %s status=%d\n", __func__, status);
        return status;
 }
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        struct inode *ino = lgp->args.inode;
+        struct nfs_server *server = NFS_SERVER(ino);
+        dprintk("--> %s\n", __func__);
+        if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+                                &lgp->res.seq_res, 0, task))
+                return;
+        rpc_call_start(task);
+}
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        dprintk("--> %s\n", __func__);
+        if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+                return;
+        switch (task->tk_status) {
+        case 0:
+                break;
+        case -NFS4ERR_LAYOUTTRYLATER:
+        case -NFS4ERR_RECALLCONFLICT:
+                task->tk_status = -NFS4ERR_DELAY;
+                /* Fall through */
+        default:
+                if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+                        rpc_restart_call_prepare(task);
+                        return;
+                }
+        }
+        lgp->status = task->tk_status;
+        dprintk("<-- %s\n", __func__);
+}
+static void nfs4_layoutget_release(void *calldata)
+{
+        struct nfs4_layoutget *lgp = calldata;
+        dprintk("--> %s\n", __func__);
+        put_layout_hdr(lgp->args.inode);
+        if (lgp->res.layout.buf != NULL)
+                free_page((unsigned long) lgp->res.layout.buf);
+        put_nfs_open_context(lgp->args.ctx);
+        kfree(calldata);
+        dprintk("<-- %s\n", __func__);
+}
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+        .rpc_call_prepare = nfs4_layoutget_prepare,
+        .rpc_call_done = nfs4_layoutget_done,
+        .rpc_release = nfs4_layoutget_release,
+};
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+                .rpc_argp = &lgp->args,
+                .rpc_resp = &lgp->res,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = server->client,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_layoutget_call_ops,
+                .callback_data = lgp,
+                .flags = RPC_TASK_ASYNC,
+        };
+        int status = 0;
+        dprintk("--> %s\n", __func__);
+        lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+        if (lgp->res.layout.buf == NULL) {
+                nfs4_layoutget_release(lgp);
+                return -ENOMEM;
+        }
+        lgp->res.seq_res.sr_slot = NULL;
+        task = rpc_run_task(&task_setup_data);
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        status = nfs4_wait_for_completion_rpc_task(task);
+        if (status != 0)
+                goto out;
+        status = lgp->status;
+        if (status != 0)
+                goto out;
+        status = pnfs_layout_process(lgp);
+out:
+        rpc_put_task(task);
+        dprintk("<-- %s status=%d\n", __func__, status);
+        return status;
+}
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+        struct nfs4_getdeviceinfo_args args = {
+                .pdev = pdev,
+        };
+        struct nfs4_getdeviceinfo_res res = {
+                .pdev = pdev,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+        };
+        int status;
+        dprintk("--> %s\n", __func__);
+        status = nfs4_call_sync(server, &msg, &args, &res, 0);
+        dprintk("<-- %s status=%d\n", __func__, status);
+        return status;
+}
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+        struct nfs4_exception exception = { };
+        int err;
+        do {
+                err = nfs4_handle_exception(server,
+                                        _nfs4_proc_getdeviceinfo(server, pdev),
+                                        &exception);
+        } while (exception.retry);
+        return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
 #endif /* CONFIG_NFS_V4_1 */
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5443,6 +5515,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .unlink_setup   = nfs4_proc_unlink_setup,
        .unlink_done    = nfs4_proc_unlink_done,
        .rename         = nfs4_proc_rename,
+        .rename_setup   = nfs4_proc_rename_setup,
+        .rename_done    = nfs4_proc_rename_done,
        .link           = nfs4_proc_link,
        .symlink        = nfs4_proc_symlink,
        .mkdir          = nfs4_proc_mkdir,
@@ -5463,6 +5537,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .lock           = nfs4_proc_lock,
        .clear_acl_cache = nfs4_zap_acl_attr,
        .close_context  = nfs4_close_context,
+        .open_context   = nfs4_atomic_open,
 };
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3e2f19b04c06..f575a3126737 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -40,12 +40,13 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
@@ -53,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 #define OPENOWNER_POOL_SIZE     8
@@ -970,13 +972,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
        /* Guard against delegation returns and new lock/unlock calls */
        down_write(&nfsi->rwsem);
        /* Protect inode->i_flock using the BKL */
-        lock_kernel();
+        lock_flocks();
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file)->state != state)
                        continue;
-                unlock_kernel();
+                unlock_flocks();
                status = ops->recover_lock(state, fl);
                switch (status) {
                        case 0:
@@ -1003,9 +1005,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
                                status = 0;
                }
-                lock_kernel();
+                lock_flocks();
        }
-        unlock_kernel();
+        unlock_flocks();
 out:
        up_write(&nfsi->rwsem);
        return status;
@@ -1063,6 +1065,14 @@ restart:
                                /* Mark the file as being 'closed' */
                                state->state = 0;
                                break;
+                        case -EKEYEXPIRED:
+                                /*
+                                 * User RPCSEC_GSS context has expired.
+                                 * We cannot recover this stateid now, so
+                                 * skip it and allow recovery thread to
+                                 * proceed.
+                                 */
+                                break;
                        case -NFS4ERR_ADMIN_REVOKED:
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_BAD_STATEID:
@@ -1138,16 +1148,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp,
                (void)ops->reclaim_complete(clp);
 }
-static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 {
        struct nfs4_state_owner *sp;
        struct rb_node *pos;
        struct nfs4_state *state;
        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
-                return;
+                return 0;
-        nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1161,6 +1169,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
        }
        nfs_delegation_reap_unclaimed(clp);
+        return 1;
+}
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+        if (!nfs4_state_clear_reclaim_reboot(clp))
+                return;
+        nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 }
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1175,6 +1191,14 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
        nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
+static void nfs4_warn_keyexpired(const char *s)
+{
+        printk_ratelimited(KERN_WARNING "Error: state manager"
+                        " encountered RPCSEC_GSS session"
+                        " expired against NFSv4 server %s.\n",
+                        s);
+}
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
        switch (error) {
@@ -1187,7 +1211,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
                case -NFS4ERR_STALE_CLIENTID:
                case -NFS4ERR_LEASE_MOVED:
                        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-                        nfs4_state_end_reclaim_reboot(clp);
+                        nfs4_state_clear_reclaim_reboot(clp);
                        nfs4_state_start_reclaim_reboot(clp);
                        break;
                case -NFS4ERR_EXPIRED:
@@ -1204,6 +1228,10 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
                        set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
                        /* Zero session reset errors */
                        return 0;
+                case -EKEYEXPIRED:
+                        /* Nothing we can do */
+                        nfs4_warn_keyexpired(clp->cl_hostname);
+                        return 0;
        }
        return error;
 }
@@ -1414,9 +1442,10 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
                case -NFS4ERR_DELAY:
                case -NFS4ERR_CLID_INUSE:
                case -EAGAIN:
-                case -EKEYEXPIRED:
                        break;
+                case -EKEYEXPIRED:
+                        nfs4_warn_keyexpired(clp->cl_hostname);
                case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
                                         * in nfs4_exchange_id */
                default:
@@ -1447,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        }
                        clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
                        set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+                        pnfs_destroy_all_layouts(clp);
                }
                if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 08ef91291132..f313c4cce7e4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_XDR
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz   (op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz   (op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+                                XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+                                1 /* layout type */ + \
+                                1 /* opaque devaddr4 length */ + \
+                                  /* devaddr4 payload is read into page */ \
+                                1 /* notification bitmap length */ + \
+                                1 /* notification bitmap */)
+#define encode_layoutget_maxsz  (op_encode_hdr_maxsz + 10 + \
+                                encode_stateid_maxsz)
+#define decode_layoutget_maxsz  (op_decode_hdr_maxsz + 8 + \
+                                decode_stateid_maxsz + \
+                                XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz   0
 #define decode_sequence_maxsz   0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz    (compound_decode_hdr_maxsz + \
                                         decode_sequence_maxsz + \
                                         decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+                                encode_sequence_maxsz +\
+                                encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+                                decode_sequence_maxsz + \
+                                decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz   (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
+                                encode_putfh_maxsz +        \
+                                encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz   (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
+                                decode_putfh_maxsz +        \
+                                decode_layoutget_maxsz)
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -816,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
        if (iap->ia_valid & ATTR_MODE)
                len += 4;
        if (iap->ia_valid & ATTR_UID) {
-                owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
+                owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
                if (owner_namelen < 0) {
                        dprintk("nfs: couldn't resolve uid %d to string\n",
                                        iap->ia_uid);
@@ -828,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
                len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
        }
        if (iap->ia_valid & ATTR_GID) {
-                owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
+                owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
                if (owner_grouplen < 0) {
                        dprintk("nfs: couldn't resolve gid %d to string\n",
                                        iap->ia_gid);
@@ -1385,24 +1413,35 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-        uint32_t attrs[2] = {
+        uint32_t attrs[2] = {0, 0};
-                FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
+        uint32_t dircount = readdir->count >> 1;
-                FATTR4_WORD1_MOUNTED_ON_FILEID,
-        };
        __be32 *p;
+        if (readdir->plus) {
+                attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+                        FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE;
+                attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+                        FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+                        FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+                        FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+                dircount >>= 1;
+        }
+        attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID;
+        attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+        /* Switch to mounted_on_fileid if the server supports it */
+        if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+                attrs[0] &= ~FATTR4_WORD0_FILEID;
+        else
+                attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
        p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
        *p++ = cpu_to_be32(OP_READDIR);
        p = xdr_encode_hyper(p, readdir->cookie);
        p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
-        *p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+        *p++ = cpu_to_be32(dircount);
        *p++ = cpu_to_be32(readdir->count);
        *p++ = cpu_to_be32(2);
-        /* Switch to mounted_on_fileid if the server supports it */
-        if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
-                attrs[0] &= ~FATTR4_WORD0_FILEID;
-        else
-                attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
        *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
        *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
        hdr->nops++;
@@ -1726,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+                     const struct nfs4_getdeviceinfo_args *args,
+                     struct compound_hdr *hdr)
+{
+        __be32 *p;
+        p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+        *p++ = cpu_to_be32(OP_GETDEVICEINFO);
+        p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+                                    NFS4_DEVICEID4_SIZE);
+        *p++ = cpu_to_be32(args->pdev->layout_type);
+        *p++ = cpu_to_be32(args->pdev->pglen);          /* gdia_maxcount */
+        *p++ = cpu_to_be32(0);                          /* bitmap length 0 */
+        hdr->nops++;
+        hdr->replen += decode_getdeviceinfo_maxsz;
+}
+static void
+encode_layoutget(struct xdr_stream *xdr,
+                      const struct nfs4_layoutget_args *args,
+                      struct compound_hdr *hdr)
+{
+        nfs4_stateid stateid;
+        __be32 *p;
+        p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+        *p++ = cpu_to_be32(OP_LAYOUTGET);
+        *p++ = cpu_to_be32(0);     /* Signal layout available */
+        *p++ = cpu_to_be32(args->type);
+        *p++ = cpu_to_be32(args->range.iomode);
+        p = xdr_encode_hyper(p, args->range.offset);
+        p = xdr_encode_hyper(p, args->range.length);
+        p = xdr_encode_hyper(p, args->minlength);
+        pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+                                args->ctx->state);
+        p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+        *p = cpu_to_be32(args->maxcount);
+        dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+                __func__,
+                args->type,
+                args->range.iomode,
+                (unsigned long)args->range.offset,
+                (unsigned long)args->range.length,
+                args->maxcount);
+        hdr->nops++;
+        hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * END OF "GENERIC" ENCODE ROUTINES.
 */
@@ -1823,7 +1914,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 /*
 * Encode RENAME request
 */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
@@ -2543,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
        return 0;
 }
+/*
+ * Encode GETDEVICEINFO request
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_args *args)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_getdeviceinfo(&xdr, args, &hdr);
+        /* set up reply kvec. Subtract notification bitmap max size (2)
+         * so that notification bitmap is put in xdr_buf tail */
+        xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+                         args->pdev->pages, args->pdev->pgbase,
+                         args->pdev->pglen);
+        encode_nops(&hdr);
+        return 0;
+}
+/*
+ *  Encode LAYOUTGET request
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+                                  struct nfs4_layoutget_args *args)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_sequence(&xdr, &args->seq_args, &hdr);
+        encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+        encode_layoutget(&xdr, args, &hdr);
+        encode_nops(&hdr);
+        return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2676,7 +2812,10 @@ out_overflow:
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
 {
        if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
-                decode_attr_bitmap(xdr, bitmask);
+                int ret;
+                ret = decode_attr_bitmap(xdr, bitmask);
+                if (unlikely(ret < 0))
+                        return ret;
                bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
        } else
                bitmask[0] = bitmask[1] = 0;
@@ -2848,6 +2987,56 @@ out_overflow:
        return -EIO;
 }
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+        __be32 *p;
+        if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+                return -EIO;
+        if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+        __be32 *p;
+        int len;
+        if (fh != NULL)
+                memset(fh, 0, sizeof(*fh));
+        if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+                return -EIO;
+        if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                len = be32_to_cpup(p);
+                if (len > NFS4_FHSIZE)
+                        return -EIO;
+                p = xdr_inline_decode(xdr, len);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (fh != NULL) {
+                        memcpy(fh->data, p, len);
+                        fh->size = len;
+                }
+                bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
        __be32 *p;
@@ -3521,6 +3710,24 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
        return status;
 }
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+                                  struct timespec *time)
+{
+        int status = 0;
+        time->tv_sec = 0;
+        time->tv_nsec = 0;
+        if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+                return -EIO;
+        if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+                status = decode_attr_time(xdr, time);
+                bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+        }
+        dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
+                (long)time->tv_nsec);
+        return status;
+}
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
        int status = 0;
@@ -3744,29 +3951,14 @@ xdr_error:
        return status;
 }
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+                struct nfs_fattr *fattr, struct nfs_fh *fh,
                const struct nfs_server *server, int may_sleep)
 {
-        __be32 *savep;
-        uint32_t attrlen,
-                 bitmap[2] = {0},
-                 type;
        int status;
        umode_t fmode = 0;
        uint64_t fileid;
+        uint32_t type;
-        status = decode_op_hdr(xdr, OP_GETATTR);
-        if (status < 0)
-                goto xdr_error;
-        status = decode_attr_bitmap(xdr, bitmap);
-        if (status < 0)
-                goto xdr_error;
-        status = decode_attr_length(xdr, &attrlen, &savep);
-        if (status < 0)
-                goto xdr_error;
        status = decode_attr_type(xdr, bitmap, &type);
        if (status < 0)
@@ -3792,6 +3984,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                goto xdr_error;
        fattr->valid |= status;
+        status = decode_attr_error(xdr, bitmap);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_attr_filehandle(xdr, bitmap, fh);
+        if (status < 0)
+                goto xdr_error;
        status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
        if (status < 0)
                goto xdr_error;
@@ -3862,12 +4062,101 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                fattr->valid |= status;
        }
+xdr_error:
+        dprintk("%s: xdr returned %d\n", __func__, -status);
+        return status;
+}
+/*
+ * Decode one GETATTR operation from a compound reply.
+ *
+ * Reads the operation header, the attribute bitmap and the attribute
+ * length, hands the attribute payload to decode_getfattr_attrs() and
+ * finally checks the consumed length with verify_attr_len().
+ *
+ * @fh is passed straight through to decode_getfattr_attrs().
+ * Returns the status of the last step performed; a negative status from
+ * any step aborts the decode and is returned as is.
+ */
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+                struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+{
+        __be32 *savep;
+        uint32_t attrlen,
+                 bitmap[2] = {0};
+        int status;
+        status = decode_op_hdr(xdr, OP_GETATTR);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_attr_bitmap(xdr, bitmap);
+        if (status < 0)
+                goto xdr_error;
+        /* savep and attrlen are kept for the verify_attr_len() call below */
+        status = decode_attr_length(xdr, &attrlen, &savep);
+        if (status < 0)
+                goto xdr_error;
+        status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+        if (status < 0)
+                goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
        dprintk("%s: xdr returned %d\n", __func__, -status);
        return status;
 }
+/*
+ * GETATTR decode entry point without a file handle argument: forwards to
+ * decode_getfattr_generic() with a NULL file-handle pointer.
+ */
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+                const struct nfs_server *server, int may_sleep)
+{
+        struct nfs_fh *no_fh = NULL;
+        return decode_getfattr_generic(xdr, fattr, no_fh, server, may_sleep);
+}
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ *
+ * The reply carries a 32-bit count followed by that many 32-bit layout
+ * types. *layouttype receives the first type, or 0 when the count is 0.
+ * Returns 0 on success and -EIO when the buffer is too short or the
+ * count is too large to be expressed as a byte length.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+                                         uint32_t *layouttype)
+{
+        __be32 *p;
+        uint32_t num;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        num = be32_to_cpup(p);
+        /* pNFS is not supported by the underlying file system */
+        if (num == 0) {
+                *layouttype = 0;
+                return 0;
+        }
+        if (num > 1)
+                printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+                        "per filesystem not supported\n", __func__);
+        /*
+         * The count comes off the wire: reject values whose byte length
+         * (num * 4) would wrap in 32 bits instead of decoding a bogus size.
+         */
+        if (unlikely(num > (uint32_t)~0U / 4))
+                goto out_overflow;
+        /* Decode and set first layout type, move xdr->p past unused types */
+        p = xdr_inline_decode(xdr, num * 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        *layouttype = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * Decode the FS_LAYOUT_TYPES attribute (the type of file system exported).
+ *
+ * *layouttype is always written on a non-error return: it is 0 when the
+ * attribute is absent from bitmap[1]. Returns -EIO if any lower bit of
+ * bitmap[1] is still set, otherwise the result of
+ * decode_first_pnfs_layout_type() or 0.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+                                uint32_t *layouttype)
+{
+        int ret;
+        dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+        if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+                return -EIO;
+        /* Attribute not present: report "no layout type" */
+        if (!(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) {
+                *layouttype = 0;
+                return 0;
+        }
+        ret = decode_first_pnfs_layout_type(xdr, layouttype);
+        /* The bit is cleared whether or not the decode succeeded */
+        bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+        return ret;
+}
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
@@ -3894,6 +4183,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
                goto xdr_error;
        fsinfo->wtpref = fsinfo->wtmax;
+        status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+        if (status != 0)
+                goto xdr_error;
+        status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+        if (status != 0)
+                goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -3950,13 +4245,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
        __be32 *p;
        uint32_t namelen, type;
-        p = xdr_inline_decode(xdr, 32);
+        p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
        if (unlikely(!p))
                goto out_overflow;
-        p = xdr_decode_hyper(p, &offset);
+        p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
        p = xdr_decode_hyper(p, &length);
-        type = be32_to_cpup(p++);
+        type = be32_to_cpup(p++); /* 4 byte read */
-        if (fl != NULL) {
+        if (fl != NULL) { /* manipulate file lock */
                fl->fl_start = (loff_t)offset;
                fl->fl_end = fl->fl_start + (loff_t)length - 1;
                if (length == ~(uint64_t)0)
@@ -3966,9 +4261,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
                        fl->fl_type = F_RDLCK;
                fl->fl_pid = 0;
        }
-        p = xdr_decode_hyper(p, &clientid);
+        p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
-        namelen = be32_to_cpup(p);
+        namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
-        p = xdr_inline_decode(xdr, namelen);
+        p = xdr_inline_decode(xdr, namelen); /* variable size field */
        if (likely(p))
                return -NFS4ERR_DENIED;
 out_overflow:
@@ -4200,12 +4495,9 @@ out_overflow:
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
 {
        struct xdr_buf  *rcvbuf = &req->rq_rcv_buf;
-        struct page     *page = *rcvbuf->pages;
        struct kvec     *iov = rcvbuf->head;
        size_t          hdrlen;
        u32             recvd, pglen = rcvbuf->page_len;
-        __be32          *end, *entry, *p, *kaddr;
-        unsigned int    nr = 0;
        int             status;
        status = decode_op_hdr(xdr, OP_READDIR);
@@ -4225,71 +4517,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
                pglen = recvd;
        xdr_read_pages(xdr, pglen);
-        BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-        kaddr = p = kmap_atomic(page, KM_USER0);
-        end = p + ((pglen + readdir->pgbase) >> 2);
-        entry = p;
-        /* Make sure the packet actually has a value_follows and EOF entry */
-        if ((entry + 1) > end)
-                goto short_pkt;
-        for (; *p++; nr++) {
-                u32 len, attrlen, xlen;
-                if (end - p < 3)
-                        goto short_pkt;
-                dprintk("cookie = %Lu, ", *((unsigned long long *)p));
-                p += 2;                 /* cookie */
-                len = ntohl(*p++);      /* filename length */
-                if (len > NFS4_MAXNAMLEN) {
-                        dprintk("NFS: giant filename in readdir (len 0x%x)\n",
-                                        len);
-                        goto err_unmap;
-                }
-                xlen = XDR_QUADLEN(len);
-                if (end - p < xlen + 1)
-                        goto short_pkt;
-                dprintk("filename = %*s\n", len, (char *)p);
-                p += xlen;
-                len = ntohl(*p++);      /* bitmap length */
-                if (end - p < len + 1)
-                        goto short_pkt;
-                p += len;
-                attrlen = XDR_QUADLEN(ntohl(*p++));
-                if (end - p < attrlen + 2)
-                        goto short_pkt;
-                p += attrlen;           /* attributes */
-                entry = p;
-        }
-        /*
-         * Apparently some server sends responses that are a valid size, but
-         * contain no entries, and have value_follows==0 and EOF==0. For
-         * those, just set the EOF marker.
-         */
-        if (!nr && entry[1] == 0) {
-                dprintk("NFS: readdir reply truncated!\n");
-                entry[1] = 1;
-        }
-out:
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
-short_pkt:
-        /*
-         * When we get a short packet there are 2 possibilities. We can
-         * return an error, or fix up the response to look like a valid
-         * response and return what we have so far. If there are no
-         * entries and the packet was short, then return -EIO. If there
-         * are valid entries in the response, return them and pretend that
-         * the call was successful, but incomplete. The caller can retry the
-         * readdir starting at the last cookie.
-         */
-        dprintk("%s: short packet at entry %d\n", __func__, nr);
-        entry[0] = entry[1] = 0;
-        if (nr)
-                goto out;
-err_unmap:
-        kunmap_atomic(kaddr, KM_USER0);
-        return -errno_NFSERR_IO;
 }
 static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -4299,7 +4528,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
        size_t hdrlen;
        u32 len, recvd;
        __be32 *p;
-        char *kaddr;
        int status;
        status = decode_op_hdr(xdr, OP_READLINK);
@@ -4330,9 +4558,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
         * and and null-terminate the text (the VFS expects
         * null-termination).
         */
-        kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0);
+        xdr_terminate_string(rcvbuf, len);
-        kaddr[len+rcvbuf->page_base] = '\0';
-        kunmap_atomic(kaddr, KM_USER0);
        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -4668,7 +4894,6 @@ static int decode_sequence(struct xdr_stream *xdr,
                           struct rpc_rqst *rqstp)
 {
 #if defined(CONFIG_NFS_V4_1)
-        struct nfs4_slot *slot;
        struct nfs4_sessionid id;
        u32 dummy;
        int status;
@@ -4700,15 +4925,14 @@ static int decode_sequence(struct xdr_stream *xdr,
                goto out_overflow;
        /* seqid */
-        slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
        dummy = be32_to_cpup(p++);
-        if (dummy != slot->seq_nr) {
+        if (dummy != res->sr_slot->seq_nr) {
                dprintk("%s Invalid sequence number\n", __func__);
                goto out_err;
        }
        /* slot id */
        dummy = be32_to_cpup(p++);
-        if (dummy != res->sr_slotid) {
+        if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) {
                dprintk("%s Invalid slot id\n", __func__);
                goto out_err;
        }
@@ -4731,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode a GETDEVICEINFO reply into @pdev.
+ *
+ * Returns 0 on success. A non-zero operation status is returned as is;
+ * for -ETOOSMALL the 4-byte minimum count that follows is stored in
+ * pdev->mincount first. Returns -EINVAL when the reply's layout type
+ * differs from pdev->layout_type, and -EIO when the buffer is too short
+ * or the notification bitmap contains a non-zero word.
+ */
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+                                struct pnfs_device *pdev)
+{
+        __be32 *p;
+        uint32_t len, type;
+        int status;
+        status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+        if (status) {
+                if (status == -ETOOSMALL) {
+                        p = xdr_inline_decode(xdr, 4);
+                        if (unlikely(!p))
+                                goto out_overflow;
+                        pdev->mincount = be32_to_cpup(p);
+                        dprintk("%s: Min count too small. mincnt = %u\n",
+                                __func__, pdev->mincount);
+                }
+                return status;
+        }
+        p = xdr_inline_decode(xdr, 8);
+        if (unlikely(!p))
+                goto out_overflow;
+        type = be32_to_cpup(p++);
+        if (type != pdev->layout_type) {
+                dprintk("%s: layout mismatch req: %u pdev: %u\n",
+                        __func__, pdev->layout_type, type);
+                return -EINVAL;
+        }
+        /*
+         * Get the length of the opaque device_addr4. xdr_read_pages places
+         * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+         * and places the remaining xdr data in xdr_buf->tail
+         */
+        pdev->mincount = be32_to_cpup(p);
+        xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+        /* Parse notification bitmap, verifying that it is zero. */
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        len = be32_to_cpup(p);
+        if (len) {
+                uint32_t i;
+                /*
+                 * len is server supplied. If 4 * len wrapped in 32 bits,
+                 * fewer bytes would be validated than the loop below
+                 * reads, so reject such counts up front.
+                 */
+                if (unlikely(len > (uint32_t)~0U / 4))
+                        goto out_overflow;
+                p = xdr_inline_decode(xdr, 4 * len);
+                if (unlikely(!p))
+                        goto out_overflow;
+                for (i = 0; i < len; i++, p++) {
+                        if (be32_to_cpup(p)) {
+                                dprintk("%s: notifications not supported\n",
+                                        __func__);
+                                return -EIO;
+                        }
+                }
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+/*
+ * Decode a LAYOUTGET reply into @res.
+ *
+ * Only the first layout of the returned array is decoded; its opaque
+ * body is copied into res->layout.buf. Returns 0 on success, the
+ * operation status if the op header reports an error, -EINVAL for an
+ * empty layout array, -ENOMEM if the layout body exceeds PAGE_SIZE,
+ * and -EIO when the buffer is too short.
+ */
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+                            struct nfs4_layoutget_res *res)
+{
+        __be32 *p;
+        int status;
+        u32 layout_count;
+        status = decode_op_hdr(xdr, OP_LAYOUTGET);
+        if (status)
+                return status;
+        /* return_on_close (4) + stateid + layout array count (4) */
+        p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+        if (unlikely(!p))
+                goto out_overflow;
+        res->return_on_close = be32_to_cpup(p++);
+        p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+        layout_count = be32_to_cpup(p);
+        if (!layout_count) {
+                dprintk("%s: server responded with empty layout array\n",
+                        __func__);
+                return -EINVAL;
+        }
+        /* offset (8) + length (8) + iomode (4) + layout type (4) */
+        p = xdr_inline_decode(xdr, 24);
+        if (unlikely(!p))
+                goto out_overflow;
+        p = xdr_decode_hyper(p, &res->range.offset);
+        p = xdr_decode_hyper(p, &res->range.length);
+        res->range.iomode = be32_to_cpup(p++);
+        res->type = be32_to_cpup(p++);
+        status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+        if (unlikely(status))
+                return status;
+        /* offset/length are 64-bit: print them in full on 32-bit builds too */
+        dprintk("%s roff:%llu rlen:%llu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+                __func__,
+                (unsigned long long)res->range.offset,
+                (unsigned long long)res->range.length,
+                res->range.iomode,
+                res->type,
+                res->layout.len);
+        /* nfs4_proc_layoutget allocated a single page */
+        if (res->layout.len > PAGE_SIZE)
+                return -ENOMEM;
+        memcpy(res->layout.buf, p, res->layout.len);
+        if (layout_count > 1) {
+                /* We only handle a length one array at the moment.  Any
+                 * further entries are just ignored.  Note that this means
+                 * the client may see a response that is less than the
+                 * minimum it requested.
+                 */
+                dprintk("%s: server responded with %u layouts, dropping tail\n",
+                        __func__, layout_count);
+        }
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
 /*
 * END OF "GENERIC" DECODE ROUTINES.
 */
@@ -4873,7 +5225,7 @@ out:
 /*
 * Decode RENAME response
 */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res)
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr;
@@ -5758,25 +6110,84 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
                status = decode_reclaim_complete(&xdr, (void *)NULL);
        return status;
 }
+/*
+ * Decode GETDEVINFO response
+ *
+ * Walks the compound reply in order: compound header, SEQUENCE, then
+ * GETDEVICEINFO into res->pdev. The first non-zero status stops the
+ * decode and is returned.
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+                                      struct nfs4_getdeviceinfo_res *res)
+{
+        struct compound_hdr hdr;
+        struct xdr_stream xdr;
+        int err;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        err = decode_compound_hdr(&xdr, &hdr);
+        if (err)
+                return err;
+        err = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (err)
+                return err;
+        return decode_getdeviceinfo(&xdr, res->pdev);
+}
+/*
+ * Decode LAYOUTGET response
+ *
+ * Operations are decoded in reply order: compound header, SEQUENCE,
+ * PUTFH, LAYOUTGET. Each step runs only if every earlier one returned
+ * 0; the first non-zero status is the return value.
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+                                  struct nfs4_layoutget_res *res)
+{
+        struct compound_hdr hdr;
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)
+                status = decode_sequence(&xdr, &res->seq_res, rqstp);
+        if (!status)
+                status = decode_putfh(&xdr);
+        if (!status)
+                status = decode_layoutget(&xdr, rqstp, res);
+        return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
-__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+                           struct nfs_server *server, int plus)
 {
        uint32_t bitmap[2] = {0};
        uint32_t len;
+        __be32 *p = xdr_inline_decode(xdr, 4);
-        if (!*p++) {
+        if (unlikely(!p))
-                if (!*p)
+                goto out_overflow;
+        if (!ntohl(*p++)) {
+                p = xdr_inline_decode(xdr, 4);
+                if (unlikely(!p))
+                        goto out_overflow;
+                if (!ntohl(*p++))
                        return ERR_PTR(-EAGAIN);
                entry->eof = 1;
                return ERR_PTR(-EBADCOOKIE);
        }
+        p = xdr_inline_decode(xdr, 12);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->prev_cookie = entry->cookie;
        p = xdr_decode_hyper(p, &entry->cookie);
        entry->len = ntohl(*p++);
+        p = xdr_inline_decode(xdr, entry->len);
+        if (unlikely(!p))
+                goto out_overflow;
        entry->name = (const char *) p;
-        p += XDR_QUADLEN(entry->len);
        /*
         * In case the server doesn't return an inode number,
@@ -5784,32 +6195,33 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
         * since glibc seems to choke on it...)
         */
        entry->ino = 1;
+        entry->fattr->valid = 0;
-        len = ntohl(*p++);              /* bitmap length */
+        if (decode_attr_bitmap(xdr, bitmap) < 0)
-        if (len-- > 0) {
+                goto out_overflow;
-                bitmap[0] = ntohl(*p++);
-                if (len-- > 0) {
+        if (decode_attr_length(xdr, &len, &p) < 0)
-                        bitmap[1] = ntohl(*p++);
+                goto out_overflow;
-                        p += len;
-                }
+        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0)
-        }
+                goto out_overflow;
-        len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */
+        if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
-        if (len > 0) {
+                entry->ino = entry->fattr->fileid;
-                if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) {
-                        bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+        if (verify_attr_len(xdr, p, len) < 0)
-                        /* Ignore the return value of rdattr_error for now */
+                goto out_overflow;
-                        p++;
-                        len--;
+        p = xdr_inline_peek(xdr, 8);
-                }
+        if (p != NULL)
-                if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID)
+                entry->eof = !p[0] && p[1];
-                        xdr_decode_hyper(p, &entry->ino);
+        else
-                else if (bitmap[0] == FATTR4_WORD0_FILEID)
+                entry->eof = 0;
-                        xdr_decode_hyper(p, &entry->ino);
-                p += len;
-        }
-        entry->eof = !p[0] && p[1];
        return p;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return ERR_PTR(-EIO);
 }
 /*
@@ -5936,6 +6348,8 @@ struct rpc_procinfo	nfs4_procedures[] = {
  PROC(SEQUENCE,        enc_sequence,   dec_sequence),
  PROC(GET_LEASE_TIME,  enc_get_lease_time,     dec_get_lease_time),
  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index df101d9f546a..903908a20023 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -3,9 +3,10 @@
 *
 *  Allow an NFS filesystem to be mounted as root. The way this works is:
 *     (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
- *     (2) Handle RPC negotiation with the system which replied to RARP or
+ *     (2) Construct the device string and the options string using DHCP
- *         was reported as a boot server by BOOTP or manually.
+ *         option 17 and/or kernel command line options.
- *     (3) The actual mounting is done later, when init() is running.
+ *     (3) When mount_root() sets up the root file system, pass these strings
+ *         to the NFS client's regular mount interface via sys_mount().
 *
 *
 *      Changes:
@@ -65,470 +66,245 @@
 *      Hua Qin         :       Support for mounting root file system via
 *                              NFS over TCP.
 *      Fabian Frederick:       Option parser rebuilt (using parser lib)
-*/
+ *      Chuck Lever     :       Use super.c's text-based mount option parsing
+ *      Chuck Lever     :       Add "nfsrootdebug".
+ */
 #include <linux/types.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprtsock.h>
 #include <linux/nfs.h>
 #include <linux/nfs_fs.h>
-#include <linux/nfs_mount.h>
-#include <linux/in.h>
-#include <linux/major.h>
 #include <linux/utsname.h>
-#include <linux/inet.h>
 #include <linux/root_dev.h>
 #include <net/ipconfig.h>
-#include <linux/parser.h>
 #include "internal.h"
-/* Define this to allow debugging output */
-#undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
-/* Default port to use if server is not running a portmapper */
-#define NFS_MNT_PORT    627
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT                "/tftpboot/%s"
 /* Parameters passed from the kernel command line */
-static char nfs_root_name[256] __initdata = "";
+static char nfs_root_parms[256] __initdata = "";
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = "";
 /* Address of NFS server */
-static __be32 servaddr __initdata = 0;
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
-/* NFS-related data */
-static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
-static int nfs_port __initdata = 0;             /* Port to connect to for NFS */
-static int mount_port __initdata = 0;           /* Mount daemon port number */
-/***************************************************************************
-                             Parsing of options
- ***************************************************************************/
-enum {
-        /* Options that take integer arguments */
-        Opt_port, Opt_rsize, Opt_wsize, Opt_timeo, Opt_retrans, Opt_acregmin,
-        Opt_acregmax, Opt_acdirmin, Opt_acdirmax,
-        /* Options that take no arguments */
-        Opt_soft, Opt_hard, Opt_intr,
-        Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
-        Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
-        Opt_acl, Opt_noacl,
-        /* Error token */
-        Opt_err
-};
-static const match_table_t tokens __initconst = {
-        {Opt_port, "port=%u"},
-        {Opt_rsize, "rsize=%u"},
-        {Opt_wsize, "wsize=%u"},
-        {Opt_timeo, "timeo=%u"},
-        {Opt_retrans, "retrans=%u"},
-        {Opt_acregmin, "acregmin=%u"},
-        {Opt_acregmax, "acregmax=%u"},
-        {Opt_acdirmin, "acdirmin=%u"},
-        {Opt_acdirmax, "acdirmax=%u"},
-        {Opt_soft, "soft"},
-        {Opt_hard, "hard"},
-        {Opt_intr, "intr"},
-        {Opt_nointr, "nointr"},
-        {Opt_posix, "posix"},
-        {Opt_noposix, "noposix"},
-        {Opt_cto, "cto"},
-        {Opt_nocto, "nocto"},
-        {Opt_ac, "ac"},
-        {Opt_noac, "noac"},
-        {Opt_lock, "lock"},
-        {Opt_nolock, "nolock"},
-        {Opt_v2, "nfsvers=2"},
-        {Opt_v2, "v2"},
-        {Opt_v3, "nfsvers=3"},
-        {Opt_v3, "v3"},
-        {Opt_udp, "proto=udp"},
-        {Opt_udp, "udp"},
-        {Opt_tcp, "proto=tcp"},
-        {Opt_tcp, "tcp"},
-        {Opt_acl, "acl"},
-        {Opt_noacl, "noacl"},
-        {Opt_err, NULL}
-        
-};
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+#ifdef RPC_DEBUG
 /*
- *  Parse option string.
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
 */
+static int __init nfs_root_debug(char *__unused)
-static int __init root_nfs_parse(char *name, char *buf)
 {
+        nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
-        char *p;
-        substring_t args[MAX_OPT_ARGS];
-        int option;
-        if (!name)
-                return 1;
-        /* Set the NFS remote path */
-        p = strsep(&name, ",");
-        if (p[0] != '\0' && strcmp(p, "default") != 0)
-                strlcpy(buf, p, NFS_MAXPATHLEN);
-        while ((p = strsep (&name, ",")) != NULL) {
-                int token; 
-                if (!*p)
-                        continue;
-                token = match_token(p, tokens, args);
-                /* %u tokens only. Beware if you add new tokens! */
-                if (token < Opt_soft && match_int(&args[0], &option))
-                        return 0;
-                switch (token) {
-                        case Opt_port:
-                                nfs_port = option;
-                                break;
-                        case Opt_rsize:
-                                nfs_data.rsize = option;
-                                break;
-                        case Opt_wsize:
-                                nfs_data.wsize = option;
-                                break;
-                        case Opt_timeo:
-                                nfs_data.timeo = option;
-                                break;
-                        case Opt_retrans:
-                                nfs_data.retrans = option;
-                                break;
-                        case Opt_acregmin:
-                                nfs_data.acregmin = option;
-                                break;
-                        case Opt_acregmax:
-                                nfs_data.acregmax = option;
-                                break;
-                        case Opt_acdirmin:
-                                nfs_data.acdirmin = option;
-                                break;
-                        case Opt_acdirmax:
-                                nfs_data.acdirmax = option;
-                                break;
-                        case Opt_soft:
-                                nfs_data.flags |= NFS_MOUNT_SOFT;
-                                break;
-                        case Opt_hard:
-                                nfs_data.flags &= ~NFS_MOUNT_SOFT;
-                                break;
-                        case Opt_intr:
-                        case Opt_nointr:
-                                break;
-                        case Opt_posix:
-                                nfs_data.flags |= NFS_MOUNT_POSIX;
-                                break;
-                        case Opt_noposix:
-                                nfs_data.flags &= ~NFS_MOUNT_POSIX;
-                                break;
-                        case Opt_cto:
-                                nfs_data.flags &= ~NFS_MOUNT_NOCTO;
-                                break;
-                        case Opt_nocto:
-                                nfs_data.flags |= NFS_MOUNT_NOCTO;
-                                break;
-                        case Opt_ac:
-                                nfs_data.flags &= ~NFS_MOUNT_NOAC;
-                                break;
-                        case Opt_noac:
-                                nfs_data.flags |= NFS_MOUNT_NOAC;
-                                break;
-                        case Opt_lock:
-                                nfs_data.flags &= ~NFS_MOUNT_NONLM;
-                                break;
-                        case Opt_nolock:
-                                nfs_data.flags |= NFS_MOUNT_NONLM;
-                                break;
-                        case Opt_v2:
-                                nfs_data.flags &= ~NFS_MOUNT_VER3;
-                                break;
-                        case Opt_v3:
-                                nfs_data.flags |= NFS_MOUNT_VER3;
-                                break;
-                        case Opt_udp:
-                                nfs_data.flags &= ~NFS_MOUNT_TCP;
-                                break;
-                        case Opt_tcp:
-                                nfs_data.flags |= NFS_MOUNT_TCP;
-                                break;
-                        case Opt_acl:
-                                nfs_data.flags &= ~NFS_MOUNT_NOACL;
-                                break;
-                        case Opt_noacl:
-                                nfs_data.flags |= NFS_MOUNT_NOACL;
-                                break;
-                        default:
-                                printk(KERN_WARNING "Root-NFS: unknown "
-                                        "option: %s\n", p);
-                                return 0;
-                }
-        }
        return 1;
 }
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
 /*
- *  Prepare the NFS data structure and parse all options.
+ *  Parse NFS server and directory information passed on the kernel
+ *  command line.
+ *
+ *  nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ *  If there is a "%s" token in the <root-dir> string, it is replaced
+ *  by the ASCII-representation of the client's IP address.
 */
-static int __init root_nfs_name(char *name)
+static int __init nfs_root_setup(char *line)
 {
-        static char buf[NFS_MAXPATHLEN] __initdata;
+        ROOT_DEV = Root_NFS;
-        char *cp;
+        /* A value starting with '/', ',' or a digit is stored verbatim */
+        if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
-        /* Set some default values */
+                strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
-        memset(&nfs_data, 0, sizeof(nfs_data));
+        } else {
-        nfs_port          = -1;
+                /*
+                 * Anything else is substituted into the NFS_ROOT format
+                 * string; clip @line first so the result fits the buffer.
+                 */
+                size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
-        nfs_data.version  = NFS_MOUNT_VERSION;
+                if (n >= sizeof(nfs_root_parms))
-        nfs_data.flags    = NFS_MOUNT_NONLM;    /* No lockd in nfs root yet */
+                        line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
-        nfs_data.rsize    = NFS_DEF_FILE_IO_SIZE;
+                sprintf(nfs_root_parms, NFS_ROOT, line);
-        nfs_data.wsize    = NFS_DEF_FILE_IO_SIZE;
-        nfs_data.acregmin = NFS_DEF_ACREGMIN;
-        nfs_data.acregmax = NFS_DEF_ACREGMAX;
-        nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
-        nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
-        strcpy(buf, NFS_ROOT);
-        /* Process options received from the remote server */
-        root_nfs_parse(root_server_path, buf);
-        /* Override them by options set on kernel command-line */
-        root_nfs_parse(name, buf);
-        cp = utsname()->nodename;
-        if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
-                printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
-                return -1;
         }
-        sprintf(nfs_export_path, buf, cp);
+        /*
+         * Extract the IP address of the NFS server containing our
+         * root file system, if one was specified.
+         *
+         * Note: root_nfs_parse_addr() removes the server-ip from
+         *       nfs_root_parms, if it exists.
+         */
+        root_server_addr = root_nfs_parse_addr(nfs_root_parms);
         return 1;
 }
+__setup("nfsroot=", nfs_root_setup);
-/*
+/*
+ * Copy @src into @dest, a buffer of @destlen bytes.
+ *
+ * Returns zero on success, or -1 if @src did not fit and was truncated.
+ */
+static int __init root_nfs_copy(char *dest, const char *src,
- *  Get NFS server address.
+                                     const size_t destlen)
- */
-static int __init root_nfs_addr(void)
 {
-        if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) {
+        /* strlcpy() returns strlen(src); a result equal to destlen is already truncated */
+        if (strlcpy(dest, src, destlen) >= destlen)
-                printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n");
                 return -1;
-        }
+        return 0;
+}
-        snprintf(nfs_data.hostname, sizeof(nfs_data.hostname),
+/*
+ * Append @src to the string already in @dest, a buffer of @destlen bytes.
+ *
+ * Returns zero on success, or -1 if the combined string did not fit.
+ */
+static int __init root_nfs_cat(char *dest, const char *src,
-                 "%pI4", &servaddr);
+                                  const size_t destlen)
+{
+        /* strlcat() returns the length it tried to create; >= destlen means truncation */
+        if (strlcat(dest, src, destlen) >= destlen)
+                return -1;
         return 0;
 }
 /*
- *  Tell the user what's going on.
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
 */
-#ifdef NFSROOT_DEBUG
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
-static void __init root_nfs_print(void)
+                                         const size_t exppathlen)
 {
-        printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
+        char *p;
-                nfs_export_path, nfs_data.hostname);
-        printk(KERN_NOTICE "Root-NFS:     rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
-                nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
-        printk(KERN_NOTICE "Root-NFS:     acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
-                nfs_data.acregmin, nfs_data.acregmax,
-                nfs_data.acdirmin, nfs_data.acdirmax);
-        printk(KERN_NOTICE "Root-NFS:     nfsd port = %d, mountd port = %d, flags = %08x\n",
-                nfs_port, mount_port, nfs_data.flags);
-}
-#endif
-static int __init root_nfs_init(void)
+        /*
-{
+         * Set the NFS remote path
-#ifdef NFSROOT_DEBUG
+         */
-        nfs_debug |= NFSDBG_ROOT;
+        p = strsep(&incoming, ",");
-#endif
+        if (*p != '\0' && strcmp(p, "default") != 0)
+                if (root_nfs_copy(exppath, p, exppathlen))
+                        return -1;
         /*
-         * Decode the root directory path name and NFS options from
+         * @incoming now points to the rest of the string; if it
-         * the kernel command line. This has to go here in order to
+         * contains something, append it to our root options buffer
-         * be able to use the client IP address for the remote root
-         * directory (necessary for pure RARP booting).
          */
-        if (root_nfs_name(nfs_root_name) < 0 ||
+        if (incoming != NULL && *incoming != '\0')
-            root_nfs_addr() < 0)
+                if (root_nfs_cat(nfs_root_options, incoming,
-                return -1;
+                                                sizeof(nfs_root_options)))
+                        return -1;
-#ifdef NFSROOT_DEBUG
+        /*
-        root_nfs_print();
+         * Possibly prepare for more options to be appended: add a
-#endif
+         * separating comma unless the buffer already ends in one.
+         * The last character is at strlen() - 1; indexing strlen()
+         * itself would always read the terminating NUL.
+         */
+        if (nfs_root_options[0] != '\0' &&
+            nfs_root_options[strlen(nfs_root_options) - 1] != ',')
+                if (root_nfs_cat(nfs_root_options, ",",
+                                                sizeof(nfs_root_options)))
+                        return -1;
         return 0;
 }
 /*
- *  Parse NFS server and directory information passed on the kernel
+ *  Decode the export directory path name and NFS options from
- *  command line.
+ *  the kernel command line.  This has to be done late in order to
+ *  use a dynamically acquired client IP address for the remote
+ *  root directory path.
+ *
+ *  Returns zero if successful; otherwise -1 is returned.
 */
-static int __init nfs_root_setup(char *line)
+static int __init root_nfs_data(char *cmdline)
 {
-        ROOT_DEV = Root_NFS;
+        char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
-        if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+        int len, retval = -1;
-                strlcpy(nfs_root_name, line, sizeof(nfs_root_name));
+        char *tmp = NULL;
-        } else {
+        const size_t tmplen = sizeof(nfs_export_path);
-                int n = strlen(line) + sizeof(NFS_ROOT) - 1;
-                if (n >= sizeof(nfs_root_name))
+        /* Scratch buffer for the export path template, freed at "out" */
+        tmp = kzalloc(tmplen, GFP_KERNEL);
-                        line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0';
+        if (tmp == NULL)
-                sprintf(nfs_root_name, NFS_ROOT, line);
+                goto out_nomem;
+        strcpy(tmp, NFS_ROOT);
+        /* Server-supplied path/options first, so the command line can override them */
+        if (root_server_path[0] != '\0') {
+                dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+                        root_server_path);
+                if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+                        goto out_optionstoolong;
         }
-        root_server_addr = root_nfs_parse_addr(nfs_root_name);
-        return 1;
-}
-__setup("nfsroot=", nfs_root_setup);
-/***************************************************************************
-               Routines to actually mount the root directory
+        if (cmdline[0] != '\0') {
+                dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+                if (root_nfs_parse_options(cmdline, tmp, tmplen))
+                        goto out_optionstoolong;
+        }
- ***************************************************************************/
+        /*
+         * Append mandatory options for nfsroot so they override
+         * what has come before
+         */
+        snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+                        &servaddr);
+        if (root_nfs_cat(nfs_root_options, addr_option,
+                                                sizeof(nfs_root_options)))
+                goto out_optionstoolong;
-/*
+        /*
- *  Construct sockaddr_in from address and port number.
+         * Set up nfs_root_device.  For NFS mounts, this looks like
- */
+         *
-static inline void
+         *      server:/path
-set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+         *
-{
+         * At this point, utsname()->nodename contains our local
-        sin->sin_family = AF_INET;
+         * IP address or hostname, set by ipconfig.  If "%s" exists
-        sin->sin_addr.s_addr = addr;
+         * in tmp, substitute the nodename, then shovel the whole
-        sin->sin_port = port;
+         * mess into nfs_root_device.
-}
+         *
+         * snprintf() returns the length the full string would have
+         * had, excluding the NUL, so a result equal to the buffer
+         * size already means the output was truncated.
+         */
+        len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+                                tmp, utsname()->nodename);
+        if (len >= (int)sizeof(nfs_export_path))
+                goto out_devnametoolong;
+        len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+                                "%pI4:%s", &servaddr, nfs_export_path);
+        if (len >= (int)sizeof(nfs_root_device))
+                goto out_devnametoolong;
-/*
+        retval = 0;
- *  Query server portmapper for the port of a daemon program.
- */
-static int __init root_nfs_getport(int program, int version, int proto)
-{
-        struct sockaddr_in sin;
-        printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n",
+out:
-                program, version, &servaddr);
+        kfree(tmp);
-        set_sockaddr(&sin, servaddr, 0);
+        return retval;
-        return rpcb_getport_sync(&sin, program, version, proto);
+out_nomem:
+        printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+        goto out;
+out_optionstoolong:
+        printk(KERN_ERR "Root-NFS: mount options string too long\n");
+        goto out;
+out_devnametoolong:
+        printk(KERN_ERR "Root-NFS: root device name too long.\n");
+        goto out;
 }
+/**
-/*
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
- *  Use portmapper to find mountd and nfsd port numbers if not overriden
+ * @root_device: OUT: address of string containing NFSROOT device
- *  by the user. Use defaults if portmapper is not available.
+ * @root_data: OUT: address of string containing NFSROOT mount options
- *  XXX: Is there any nfs server with no portmapper?
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
 */
-static int __init root_nfs_ports(void)
+int __init nfs_root_data(char **root_device, char **root_data)
 {
-        int port;
+        /* Without a server address there is nothing to mount */
+        servaddr = root_server_addr;
-        int nfsd_ver, mountd_ver;
+        if (servaddr == htonl(INADDR_NONE)) {
-        int nfsd_port, mountd_port;
+                printk(KERN_ERR "Root-NFS: no NFS server address\n");
-        int proto;
+                return -1;
-        if (nfs_data.flags & NFS_MOUNT_VER3) {
-                nfsd_ver = NFS3_VERSION;
-                mountd_ver = NFS_MNT3_VERSION;
-                nfsd_port = NFS_PORT;
-                mountd_port = NFS_MNT_PORT;
-        } else {
-                nfsd_ver = NFS2_VERSION;
-                mountd_ver = NFS_MNT_VERSION;
-                nfsd_port = NFS_PORT;
-                mountd_port = NFS_MNT_PORT;
-        }
-        proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-        if (nfs_port < 0) {
-                if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) {
-                        printk(KERN_ERR "Root-NFS: Unable to get nfsd port "
-                                        "number from server, using default\n");
-                        port = nfsd_port;
-                }
-                nfs_port = port;
-                dprintk("Root-NFS: Portmapper on server returned %d "
-                        "as nfsd port\n", port);
         }
-        if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
+        /* Fills nfs_root_device and nfs_root_options from the saved nfsroot= string */
+        if (root_nfs_data(nfs_root_parms) < 0)
-                printk(KERN_ERR "Root-NFS: Unable to get mountd port "
+                return -1;
-                                "number from server, using default\n");
-                port = mountd_port;
-        }
-        mount_port = port;
-        dprintk("Root-NFS: mountd port is %d\n", port);
+        *root_device = nfs_root_device;
+        *root_data = nfs_root_options;
         return 0;
 }
-/*
- *  Get a file handle from the server for the directory which is to be
- *  mounted.
- */
-static int __init root_nfs_get_handle(void)
-{
-        struct sockaddr_in sin;
-        unsigned int auth_flav_len = 0;
-        struct nfs_mount_request request = {
-                .sap            = (struct sockaddr *)&sin,
-                .salen          = sizeof(sin),
-                .dirpath        = nfs_export_path,
-                .version        = (nfs_data.flags & NFS_MOUNT_VER3) ?
-                                        NFS_MNT3_VERSION : NFS_MNT_VERSION,
-                .protocol       = (nfs_data.flags & NFS_MOUNT_TCP) ?
-                                        XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-                .auth_flav_len  = &auth_flav_len,
-        };
-        int status = -ENOMEM;
-        request.fh = nfs_alloc_fhandle();
-        if (!request.fh)
-                goto out;
-        set_sockaddr(&sin, servaddr, htons(mount_port));
-        status = nfs_mount(&request);
-        if (status < 0)
-                printk(KERN_ERR "Root-NFS: Server returned error %d "
-                                "while mounting %s\n", status, nfs_export_path);
-        else {
-                nfs_data.root.size = request.fh->size;
-                memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
-        }
-        nfs_free_fhandle(request.fh);
-out:
-        return status;
-}
-/*
- *  Get the NFS port numbers and file handle, and return the prepared 'data'
- *  argument for mount() if everything went OK. Return NULL otherwise.
- */
-void * __init nfs_root_data(void)
-{
-        if (root_nfs_init() < 0
-         || root_nfs_ports() < 0
-         || root_nfs_get_handle() < 0)
-                return NULL;
-        set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port));
-        return (void*)&nfs_data;
-}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 919490232e17..137b549e63db 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -65,6 +65,13 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
        if (req == NULL)
                return ERR_PTR(-ENOMEM);
+        /* get lock context early so we can deal with alloc failures */
+        req->wb_lock_context = nfs_get_lock_context(ctx);
+        if (req->wb_lock_context == NULL) {
+                nfs_page_free(req);
+                return ERR_PTR(-ENOMEM);
+        }
        /* Initialize the request struct. Initially, we assume a
         * long write-back delay. This will be adjusted in
         * update_nfs_request below if the region is not locked. */
@@ -79,7 +86,6 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
        req->wb_pgbase  = offset;
        req->wb_bytes   = count;
        req->wb_context = get_nfs_open_context(ctx);
-        req->wb_lock_context = nfs_get_lock_context(ctx);
        kref_init(&req->wb_kref);
        return req;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000000..db773428f95f
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,783 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "pnfs.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+/*
+ * Return the registered pnfs layout driver module matching given id,
+ * or NULL if none is registered.  Walks pnfs_modules_tbl without taking
+ * pnfs_spinlock itself; the callers in this file hold it around the call.
+ */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+        struct pnfs_layoutdriver_type *local;
+        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+                if (local->id == id)
+                        goto out;
+        /* Loop ran to completion: no entry matched */
+        local = NULL;
+out:
+        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+        return local;
+}
+/*
+ * Look up the layout driver registered for @id, taking pnfs_spinlock
+ * around the table walk.  Returns NULL if no driver matches.
+ */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+        struct pnfs_layoutdriver_type *local;
+        spin_lock(&pnfs_spinlock);
+        local = find_pnfs_driver_locked(id);
+        spin_unlock(&pnfs_spinlock);
+        return local;
+}
+/*
+ * Detach @nfss from its current layout driver, if it has one: call the
+ * driver's clear_layoutdriver hook, drop the module reference, and
+ * clear pnfs_curr_ld.
+ */
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+        if (nfss->pnfs_curr_ld) {
+                nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+                module_put(nfss->pnfs_curr_ld->owner);
+        }
+        nfss->pnfs_curr_ld = NULL;
+}
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ *
+ * On any failure server->pnfs_curr_ld is left NULL.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+{
+        struct pnfs_layoutdriver_type *ld_type = NULL;
+        if (id == 0)
+                goto out_no_driver;
+        /* Neither the NON_PNFS nor the PNFS_MDS exchange flag is set: report and give up */
+        if (!(server->nfs_client->cl_exchange_flags &
+                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+                printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
+                       id, server->nfs_client->cl_exchange_flags);
+                goto out_no_driver;
+        }
+        ld_type = find_pnfs_driver(id);
+        if (!ld_type) {
+                /* Not registered yet: ask for the module to be loaded, then look again */
+                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+                ld_type = find_pnfs_driver(id);
+                if (!ld_type) {
+                        dprintk("%s: No pNFS module found for %u.\n",
+                                __func__, id);
+                        goto out_no_driver;
+                }
+        }
+        if (!try_module_get(ld_type->owner)) {
+                dprintk("%s: Could not grab reference on module\n", __func__);
+                goto out_no_driver;
+        }
+        server->pnfs_curr_ld = ld_type;
+        if (ld_type->set_layoutdriver(server)) {
+                printk(KERN_ERR
+                       "%s: Error initializing mount point for layout driver %u.\n",
+                       __func__, id);
+                /* Undo the try_module_get() above */
+                module_put(ld_type->owner);
+                goto out_no_driver;
+        }
+        dprintk("%s: pNFS module for %u set\n", __func__, id);
+        return;
+out_no_driver:
+        dprintk("%s: Using NFSv4 I/O\n", __func__);
+        server->pnfs_curr_ld = NULL;
+}
+/*
+ * Add @ld_type to pnfs_modules_tbl.
+ *
+ * Returns zero on success.  Returns -EINVAL if the id is zero (reserved),
+ * if the driver lacks alloc_lseg or free_lseg, or if a driver with the
+ * same id is already registered.
+ */
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+        int status = -EINVAL;
+        struct pnfs_layoutdriver_type *tmp;
+        if (ld_type->id == 0) {
+                printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+                return status;
+        }
+        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+                printk(KERN_ERR "%s Layout driver must provide "
+                       "alloc_lseg and free_lseg.\n", __func__);
+                return status;
+        }
+        /* Lookup and insertion happen under one lock hold so ids stay unique */
+        spin_lock(&pnfs_spinlock);
+        tmp = find_pnfs_driver_locked(ld_type->id);
+        if (!tmp) {
+                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+                status = 0;
+                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+                        ld_type->name);
+        } else {
+                /* id is printed with %u everywhere else in this file */
+                printk(KERN_ERR "%s Module with id %u already loaded!\n",
+                        __func__, ld_type->id);
+        }
+        spin_unlock(&pnfs_spinlock);
+        return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+/* Remove @ld_type from pnfs_modules_tbl under pnfs_spinlock. */
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+        spin_lock(&pnfs_spinlock);
+        list_del(&ld_type->pnfs_tblid);
+        spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+/*
+ * pNFS client layout cache
+ */
+/* Take a reference on @lo.  The inode's i_lock must be held (asserted). */
+static void
+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+        assert_spin_locked(&lo->inode->i_lock);
+        lo->refcount++;
+}
+/*
+ * Drop a reference on @lo.  The inode's i_lock must be held (asserted).
+ * When the count reaches zero the inode's layout pointer is cleared and
+ * @lo is freed, so the caller must not touch @lo afterwards.
+ */
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+        assert_spin_locked(&lo->inode->i_lock);
+        BUG_ON(lo->refcount == 0);
+        lo->refcount--;
+        if (!lo->refcount) {
+                dprintk("%s: freeing layout cache %p\n", __func__, lo);
+                /* Must already be off the client's layout list */
+                BUG_ON(!list_empty(&lo->layouts));
+                NFS_I(lo->inode)->layout = NULL;
+                kfree(lo);
+        }
+}
+/* Drop a reference on @inode's layout header, taking i_lock around the put. */
+void
+put_layout_hdr(struct inode *inode)
+{
+        spin_lock(&inode->i_lock);
+        put_layout_hdr_locked(NFS_I(inode)->layout);
+        spin_unlock(&inode->i_lock);
+}
+/*
+ * Initialise the generic part of @lseg: empty list linkage, a fresh
+ * kref, and a back-pointer to its layout header @lo.
+ */
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+        INIT_LIST_HEAD(&lseg->fi_list);
+        kref_init(&lseg->kref);
+        lseg->layout = lo;
+}
+/*
+ * kref release function for a layout segment (passed to kref_put() by
+ * put_lseg()).  Called without i_lock held, as the free_lseg call may sleep.
+ */
+static void
+destroy_lseg(struct kref *kref)
+{
+        struct pnfs_layout_segment *lseg =
+                container_of(kref, struct pnfs_layout_segment, kref);
+        /* Fetch the inode first: lseg is handed to the driver to free below */
+        struct inode *ino = lseg->layout->inode;
+        dprintk("--> %s\n", __func__);
+        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+        /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+        put_layout_hdr(ino);
+}
+/*
+ * Drop a reference on @lseg; the last put runs destroy_lseg().
+ * A NULL @lseg is ignored.
+ */
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
+{
+        if (!lseg)
+                return;
+        dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+                atomic_read(&lseg->kref.refcount));
+        kref_put(&lseg->kref, destroy_lseg);
+}
+/*
+ * Detach everything cached under @lo.  i_lock must be held (asserted).
+ *
+ * All segments are moved from lo->segs onto @tmp_list, @lo is unlinked
+ * from the client's layout list, and the "stateid set" flag is cleared.
+ * The segments themselves are not released here; see
+ * pnfs_free_lseg_list().
+ */
+static void
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+{
+        struct pnfs_layout_segment *lseg, *next;
+        struct nfs_client *clp;
+        dprintk("%s:Begin lo %p\n", __func__, lo);
+        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+                dprintk("%s: freeing lseg %p\n", __func__, lseg);
+                list_move(&lseg->fi_list, tmp_list);
+        }
+        clp = NFS_SERVER(lo->inode)->nfs_client;
+        spin_lock(&clp->cl_lock);
+        /* List does not take a reference, so no need for put here */
+        list_del_init(&lo->layouts);
+        spin_unlock(&clp->cl_lock);
+        write_seqlock(&lo->seqlock);
+        clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+        write_sequnlock(&lo->seqlock);
+        dprintk("%s:Return\n", __func__);
+}
+/*
+ * Unlink every segment on @tmp_list and drop one reference on each via
+ * put_lseg().
+ */
+static void
+pnfs_free_lseg_list(struct list_head *tmp_list)
+{
+        struct pnfs_layout_segment *lseg;
+        while (!list_empty(tmp_list)) {
+                lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+                                fi_list);
+                dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+                list_del(&lseg->fi_list);
+                put_lseg(lseg);
+        }
+}
+/*
+ * Tear down the layout cached on @nfsi, if any.  Segments are collected
+ * onto a local list under i_lock and released only after the lock is
+ * dropped.
+ */
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+        struct pnfs_layout_hdr *lo;
+        LIST_HEAD(tmp_list);
+        spin_lock(&nfsi->vfs_inode.i_lock);
+        lo = nfsi->layout;
+        if (lo) {
+                pnfs_clear_lseg_list(lo, &tmp_list);
+                /* Matched by refcount set to 1 in alloc_init_layout_hdr */
+                put_layout_hdr_locked(lo);
+        }
+        spin_unlock(&nfsi->vfs_inode.i_lock);
+        pnfs_free_lseg_list(&tmp_list);
+}
+/*
+ * Called by the state manager to remove all layouts established under an
+ * expired lease.
+ *
+ * The client's layout list is spliced onto a local list under cl_lock,
+ * then each layout is destroyed in turn.  The loop terminates because
+ * pnfs_destroy_layout() unlinks the header from the list it is on
+ * (list_del_init() in pnfs_clear_lseg_list()).
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+        struct pnfs_layout_hdr *lo;
+        LIST_HEAD(tmp_list);
+        spin_lock(&clp->cl_lock);
+        list_splice_init(&clp->cl_layouts, &tmp_list);
+        spin_unlock(&clp->cl_lock);
+        while (!list_empty(&tmp_list)) {
+                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+                                layouts);
+                dprintk("%s freeing layout for inode %lu\n", __func__,
+                        lo->inode->i_ino);
+                pnfs_destroy_layout(NFS_I(lo->inode));
+        }
+}
+/* Update lo->stateid with @new if @new is more recent.
+ *
+ * lo->stateid could be the open stateid, in which case we just use what
+ * we are given.
+ *
+ * @new replaces the stored stateid when no layout stateid has been set
+ * yet, when the "other" fields differ, or when the "other" fields match
+ * and @new carries a newer seqid.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+                        const nfs4_stateid *new)
+{
+        nfs4_stateid *old = &lo->stateid;
+        bool overwrite = false;
+        write_seqlock(&lo->seqlock);
+        if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+            memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+                overwrite = true;
+        else {
+                u32 oldseq, newseq;
+                oldseq = be32_to_cpu(old->stateid.seqid);
+                newseq = be32_to_cpu(new->stateid.seqid);
+                /* Signed difference keeps the comparison correct across seqid wrap-around */
+                if ((int)(newseq - oldseq) > 0)
+                        overwrite = true;
+        }
+        if (overwrite)
+                memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+        write_sequnlock(&lo->seqlock);
+}
+/*
+ * Seed lo->stateid from the open stateid in @state and mark the layout
+ * stateid as set.  The copy is made under lo->seqlock held for write,
+ * and is retried until state->seqlock reports a consistent read.
+ */
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+                              struct nfs4_state *state)
+{
+        int seq;
+        dprintk("--> %s\n", __func__);
+        write_seqlock(&lo->seqlock);
+        do {
+                seq = read_seqbegin(&state->seqlock);
+                memcpy(lo->stateid.data, state->stateid.data,
+                       sizeof(state->stateid.data));
+        } while (read_seqretry(&state->seqlock, seq));
+        set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+        write_sequnlock(&lo->seqlock);
+        dprintk("<-- %s\n", __func__);
+}
+/*
+ * Copy the layout stateid of @lo into @dst.  If no layout stateid has
+ * been set yet, it is first initialised from @open_state; that write
+ * bumps lo->seqlock, so the read loop goes round again and copies the
+ * freshly stored value.
+ */
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                        struct nfs4_state *open_state)
+{
+        int seq;
+        dprintk("--> %s\n", __func__);
+        do {
+                seq = read_seqbegin(&lo->seqlock);
+                if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+                        /* This will trigger retry of the read */
+                        pnfs_layout_from_open_stateid(lo, open_state);
+                } else
+                        memcpy(dst->data, lo->stateid.data,
+                               sizeof(lo->stateid.data));
+        } while (read_seqretry(&lo->seqlock, seq));
+        dprintk("<-- %s\n", __func__);
+}
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*
+* Returns the layout segment stored through lgp->lsegpp, or NULL.  When
+* no segment comes back, the per-iomode failure bit is set in lo->state.
+*/
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+           struct nfs_open_context *ctx,
+           u32 iomode)
+{
+        struct inode *ino = lo->inode;
+        struct nfs_server *server = NFS_SERVER(ino);
+        struct nfs4_layoutget *lgp;
+        struct pnfs_layout_segment *lseg = NULL;
+        dprintk("--> %s\n", __func__);
+        BUG_ON(ctx == NULL);
+        lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+        if (lgp == NULL) {
+                /* No request will be sent, so drop the layout header reference here */
+                put_layout_hdr(lo->inode);
+                return NULL;
+        }
+        lgp->args.minlength = NFS4_MAX_UINT64;
+        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+        lgp->args.range.iomode = iomode;
+        lgp->args.range.offset = 0;
+        lgp->args.range.length = NFS4_MAX_UINT64;
+        lgp->args.type = server->pnfs_curr_ld->id;
+        lgp->args.inode = ino;
+        lgp->args.ctx = get_nfs_open_context(ctx);
+        /* The call reports its result by writing through this pointer */
+        lgp->lsegpp = &lseg;
+        /* Synchronously retrieve layout information from server and
+         * store in lseg.
+         */
+        nfs4_proc_layoutget(lgp);
+        if (!lseg) {
+                /* remember that LAYOUTGET failed and suspend trying */
+                set_bit(lo_fail_bit(iomode), &lo->state);
+        }
+        return lseg;
+}
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ *
+ * Returns 1 if only @iomode2 is IOMODE_READ, -1 if only @iomode1 is,
+ * and 0 otherwise.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+        /* read > read/write */
+        return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+}
+/*
+ * Insert @lseg into lo->segs at the position chosen by cmp_layout(),
+ * and take a layout header reference on its behalf.  i_lock must be
+ * held (asserted).  If this is the first segment, @lo is also added to
+ * the client's list of layouts.
+ */
+static void
+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+                   struct pnfs_layout_segment *lseg)
+{
+        struct pnfs_layout_segment *lp;
+        int found = 0;
+        dprintk("%s:Begin\n", __func__);
+        assert_spin_locked(&lo->inode->i_lock);
+        if (list_empty(&lo->segs)) {
+                struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+                spin_lock(&clp->cl_lock);
+                BUG_ON(!list_empty(&lo->layouts));
+                list_add_tail(&lo->layouts, &clp->cl_layouts);
+                spin_unlock(&clp->cl_lock);
+        }
+        list_for_each_entry(lp, &lo->segs, fi_list) {
+                /* Skip existing entries that sort ahead of the new segment */
+                if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+                        continue;
+                /* list_add_tail() on lp's node links lseg immediately before lp */
+                list_add_tail(&lseg->fi_list, &lp->fi_list);
+                dprintk("%s: inserted lseg %p "
+                        "iomode %d offset %llu length %llu before "
+                        "lp %p iomode %d offset %llu length %llu\n",
+                        __func__, lseg, lseg->range.iomode,
+                        lseg->range.offset, lseg->range.length,
+                        lp, lp->range.iomode, lp->range.offset,
+                        lp->range.length);
+                found = 1;
+                break;
+        }
+        if (!found) {
+                list_add_tail(&lseg->fi_list, &lo->segs);
+                dprintk("%s: inserted lseg %p "
+                        "iomode %d offset %llu length %llu at tail\n",
+                        __func__, lseg, lseg->range.iomode,
+                        lseg->range.offset, lseg->range.length);
+        }
+        /* Dropped again in destroy_lseg() */
+        get_layout_hdr_locked(lo);
+        dprintk("%s:Return\n", __func__);
+}
+/*
+ * Allocate a zeroed layout header for @ino with a reference count of 1,
+ * empty lists, and an initialised seqlock.  Returns NULL if the
+ * allocation fails.
+ */
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo;
+        lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+        if (!lo)
+                return NULL;
+        lo->refcount = 1;
+        INIT_LIST_HEAD(&lo->layouts);
+        INIT_LIST_HEAD(&lo->segs);
+        seqlock_init(&lo->seqlock);
+        lo->inode = ino;
+        return lo;
+}
+/*
+ * Return the layout header of @ino, allocating one if there is none.
+ * Called with i_lock held (asserted) and returns with it held, but the
+ * lock is dropped around the allocation, so another task may install a
+ * header in the meantime; in that case the new one is discarded.
+ * Returns NULL if the allocation failed and nobody else installed one.
+ */
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_hdr *new = NULL;
+        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+        assert_spin_locked(&ino->i_lock);
+        if (nfsi->layout)
+                return nfsi->layout;
+        spin_unlock(&ino->i_lock);
+        new = alloc_init_layout_hdr(ino);
+        spin_lock(&ino->i_lock);
+        if (likely(nfsi->layout == NULL))       /* Won the race? */
+                nfsi->layout = new;
+        else
+                kfree(new);
+        return nfsi->layout;
+}
+/*
+ * iomode matching rules:
+ * iomode       lseg    match
+ * -----        -----   -----
+ * ANY          READ    true
+ * ANY          RW      true
+ * RW           READ    false
+ * RW           RW      true
+ * READ         READ    true
+ * READ         RW      true
+ *
+ * In other words, only a request for IOMODE_RW is selective: it needs
+ * an IOMODE_RW segment.  Returns non-zero on a match.
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+        return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+}
+/*
+ * lookup range in layout
+ *
+ * Returns the first cached segment of @lo that satisfies @iomode, or
+ * NULL.  i_lock must be held (asserted).  No reference is taken on the
+ * returned segment.
+ */
+static struct pnfs_layout_segment *
+pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+        struct pnfs_layout_segment *lseg, *ret = NULL;
+        dprintk("%s:Begin\n", __func__);
+        assert_spin_locked(&lo->inode->i_lock);
+        list_for_each_entry(lseg, &lo->segs, fi_list) {
+                if (is_matching_lseg(lseg, iomode)) {
+                        ret = lseg;
+                        break;
+                }
+                /* Stop early once cmp_layout() puts this entry past @iomode */
+                if (cmp_layout(iomode, lseg->range.iomode) > 0)
+                        break;
+        }
+        dprintk("%s:Return lseg %p ref %d\n",
+                __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+        return ret;
+}
+/*
+ * Layout segment is retrieved from the server if not cached.
+ *
+ * @ino:    inode whose layout is wanted
+ * @ctx:    open context handed on to send_layoutget()
+ * @iomode: requested I/O mode
+ *
+ * Returns NULL when pNFS is not enabled for the superblock, when no layout
+ * header could be obtained, or when an earlier LAYOUTGET for this iomode
+ * already failed.  Otherwise returns the cached segment found by
+ * pnfs_has_layout(), or whatever send_layoutget() returns.
+ *
+ * NOTE(review): the cached-segment path does not take an extra reference
+ * on the segment inside this function.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+                   struct nfs_open_context *ctx,
+                   enum pnfs_iomode iomode)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct pnfs_layout_hdr *lo;
+        struct pnfs_layout_segment *lseg = NULL;
+        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+                return NULL;
+        spin_lock(&ino->i_lock);
+        lo = pnfs_find_alloc_layout(ino);
+        if (lo == NULL) {
+                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
+                goto out_unlock;
+        }
+        /* Check to see if the layout for the given range already exists */
+        lseg = pnfs_has_layout(lo, iomode);
+        if (lseg) {
+                dprintk("%s: Using cached lseg %p for iomode %d)\n",
+                        __func__, lseg, iomode);
+                goto out_unlock;
+        }
+        /* if LAYOUTGET already failed once we don't try again */
+        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+                goto out_unlock;
+        get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+        spin_unlock(&ino->i_lock);
+        lseg = send_layoutget(lo, ctx, iomode);
+out:
+        /*
+         * nfsi->layout is still NULL when the header allocation failed
+         * above, so it must not be dereferenced unconditionally here.
+         */
+        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+                nfsi->layout ? nfsi->layout->state : 0UL, lseg);
+        return lseg;
+out_unlock:
+        spin_unlock(&ino->i_lock);
+        goto out;
+}
+/*
+ * Turn a LAYOUTGET result into a cached layout segment.
+ *
+ * The layout driver's alloc_lseg() builds the segment from lgp->res.  Under
+ * the inode's i_lock the segment is then initialised, given the returned
+ * range, published through *lgp->lsegpp, inserted into the layout header
+ * and the layout stateid is updated from the reply.
+ *
+ * Returns 0 on success, -ENOMEM if alloc_lseg() returned NULL, or the error
+ * encoded in the pointer alloc_lseg() returned.
+ */
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+        struct nfs4_layoutget_res *res = &lgp->res;
+        struct pnfs_layout_segment *lseg;
+        struct inode *ino = lo->inode;
+        int status = 0;
+        /* Inject layout blob into I/O device driver */
+        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+        if (!lseg || IS_ERR(lseg)) {
+                /* The driver may signal failure as NULL or as an ERR_PTR */
+                if (!lseg)
+                        status = -ENOMEM;
+                else
+                        status = PTR_ERR(lseg);
+                dprintk("%s: Could not allocate layout: error %d\n",
+                       __func__, status);
+                goto out;
+        }
+        spin_lock(&ino->i_lock);
+        init_lseg(lo, lseg);
+        lseg->range = res->range;
+        *lgp->lsegpp = lseg;
+        pnfs_insert_layout(lo, lseg);
+        /* Done processing layoutget. Set the layout stateid */
+        pnfs_set_layout_stateid(lo, &res->stateid);
+        spin_unlock(&ino->i_lock);
+out:
+        return status;
+}
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+/*
+ * Attach a device ID cache to @clp, or take another reference on the one
+ * it already has.
+ *
+ * A candidate cache is allocated before clp->cl_lock is taken.  If the
+ * client already owns a cache, its dc_ref is incremented and the candidate
+ * is freed; otherwise the candidate is initialised with one reference and
+ * @free_callback, and installed as clp->cl_devid_cache.
+ *
+ * Returns 0 on success or -ENOMEM if the candidate allocation fails (this
+ * is checked before looking at whether a cache already exists).
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+                         void (*free_callback)(struct pnfs_deviceid_node *))
+{
+        struct pnfs_deviceid_cache *c;
+        c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+        if (!c)
+                return -ENOMEM;
+        spin_lock(&clp->cl_lock);
+        if (clp->cl_devid_cache != NULL) {
+                atomic_inc(&clp->cl_devid_cache->dc_ref);
+                dprintk("%s [kref [%d]]\n", __func__,
+                        atomic_read(&clp->cl_devid_cache->dc_ref));
+                kfree(c);
+        } else {
+                /* kzalloc initializes hlists */
+                spin_lock_init(&c->dc_lock);
+                atomic_set(&c->dc_ref, 1);
+                c->dc_free_callback = free_callback;
+                clp->cl_devid_cache = c;
+                dprintk("%s [new]\n", __func__);
+        }
+        spin_unlock(&clp->cl_lock);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ *
+ * Drops one reference on @devid.  When that was the last one, c->dc_lock is
+ * taken by atomic_dec_and_lock(), the node with the same device ID is
+ * unhashed from its bucket, the lock is released, an RCU grace period is
+ * waited for, and @devid is handed to c->dc_free_callback.  It is a BUG()
+ * if no node with that ID is found in the bucket.
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                  struct pnfs_deviceid_node *devid)
+{
+        struct nfs4_deviceid *id = &devid->de_id;
+        struct pnfs_deviceid_node *d;
+        struct hlist_node *n;
+        long h = nfs4_deviceid_hash(id);
+        dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+        /* Returns with dc_lock held only if the count dropped to zero */
+        if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+                return;
+        hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+                if (!memcmp(&d->de_id, id, sizeof(*id))) {
+                        hlist_del_rcu(&d->de_node);
+                        spin_unlock(&c->dc_lock);
+                        /* Let concurrent RCU readers of the bucket finish before freeing */
+                        synchronize_rcu();
+                        c->dc_free_callback(devid);
+                        return;
+                }
+        spin_unlock(&c->dc_lock);
+        /* Why wasn't it found in  the list? */
+        BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+/*
+ * Look up @id in cache @c under rcu_read_lock().
+ *
+ * Returns the first node in the bucket whose device ID equals @id, with its
+ * de_ref incremented, or NULL if there is no such node or its reference
+ * count had already reached zero.
+ */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+        struct pnfs_deviceid_node *node, *found = NULL;
+        struct hlist_node *pos;
+        long hash = nfs4_deviceid_hash(id);
+        dprintk("--> %s hash %ld\n", __func__, hash);
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(node, pos, &c->dc_deviceids[hash], de_node) {
+                if (memcmp(&node->de_id, id, sizeof(*id)) != 0)
+                        continue;
+                /* The first ID match decides; a zero count means no reference can be taken */
+                if (atomic_inc_not_zero(&node->de_ref))
+                        found = node;
+                break;
+        }
+        rcu_read_unlock();
+        return found;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ *
+ * With c->dc_lock held, the cache is searched for new->de_id.  If an entry
+ * exists, @new is passed to c->dc_free_callback and the existing node is
+ * returned carrying the reference taken by pnfs_find_get_deviceid().
+ * Otherwise @new is given a reference count of one, hashed into the cache
+ * and returned.
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+        struct pnfs_deviceid_node *d;
+        long hash = nfs4_deviceid_hash(&new->de_id);
+        dprintk("--> %s hash %ld\n", __func__, hash);
+        spin_lock(&c->dc_lock);
+        d = pnfs_find_get_deviceid(c, &new->de_id);
+        if (d) {
+                /* Lost the race: drop the lock before calling the free callback */
+                spin_unlock(&c->dc_lock);
+                dprintk("%s [discard]\n", __func__);
+                c->dc_free_callback(new);
+                return d;
+        }
+        INIT_HLIST_NODE(&new->de_node);
+        atomic_set(&new->de_ref, 1);
+        hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+        spin_unlock(&c->dc_lock);
+        dprintk("%s [new]\n", __func__);
+        return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+/*
+ * Drop one reference on the device ID cache of @clp.
+ *
+ * When the last reference goes away, clp->cl_lock is taken by
+ * atomic_dec_and_lock(), every hash bucket is checked to be empty (BUG_ON
+ * otherwise), clp->cl_devid_cache is cleared and the cache is freed.
+ *
+ * clp->cl_devid_cache is dereferenced unconditionally, so the client must
+ * have a cache attached when this is called.
+ */
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+        struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+        dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+        if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+                int i;
+                /* Verify cache is empty */
+                for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+                        BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+                clp->cl_devid_cache = NULL;
+                spin_unlock(&clp->cl_lock);
+                kfree(local);
+        }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000000..e12367d50489
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,189 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+/* One layout segment cached for an inode; declared regardless of CONFIG_NFS_V4_1 */
+struct pnfs_layout_segment {
+        struct list_head fi_list;       /* entry on pnfs_layout_hdr.segs */
+        struct pnfs_layout_range range; /* iomode/offset/length covered */
+        struct kref kref;               /* reference count */
+        struct pnfs_layout_hdr *layout; /* presumably the owning header - set outside this excerpt */
+};
+#ifdef CONFIG_NFS_V4_1
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+/* Bit numbers used with pnfs_layout_hdr.state (see lo_fail_bit() for the *_FAILED bits) */
+enum {
+        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
+        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
+        NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
+};
+/*
+ * Per-layout driver specific registration structure.
+ * Passed to pnfs_register_layoutdriver() / pnfs_unregister_layoutdriver().
+ */
+struct pnfs_layoutdriver_type {
+        struct list_head pnfs_tblid;    /* presumably links registered drivers - list head not shown here */
+        const u32 id;
+        const char *name;
+        struct module *owner;
+        int (*set_layoutdriver) (struct nfs_server *);
+        int (*clear_layoutdriver) (struct nfs_server *);
+        /* Builds a segment from a LAYOUTGET result; called by pnfs_layout_process(), which accepts NULL or ERR_PTR as failure */
+        struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+        void (*free_lseg) (struct pnfs_layout_segment *lseg);
+};
+/* Per-inode layout header, reachable through the nfs_inode's layout pointer */
+struct pnfs_layout_hdr {
+        unsigned long           refcount;  /* starts at 1 in alloc_init_layout_hdr() */
+        struct list_head        layouts;   /* other client layouts */
+        struct list_head        segs;      /* layout segments list */
+        seqlock_t               seqlock;   /* Protects the stateid */
+        nfs4_stateid            stateid;
+        unsigned long           state;     /* NFS_LAYOUT_* bits, accessed with test_bit() */
+        struct inode            *inode;    /* inode this header was allocated for */
+};
+/*
+ * Argument block taken by nfs4_proc_getdeviceinfo().
+ * NOTE(review): field semantics are not visible in this header; presumably
+ * pages/pgbase/pglen describe the reply buffer - confirm against nfs4proc.c.
+ */
+struct pnfs_device {
+        struct nfs4_deviceid dev_id;
+        unsigned int  layout_type;
+        unsigned int  mincount;
+        struct page **pages;
+        void          *area;
+        unsigned int  pgbase;
+        unsigned int  pglen;
+};
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS        5
+#define NFS4_DEVICE_ID_HASH_SIZE        (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK        (NFS4_DEVICE_ID_HASH_SIZE - 1)
+/*
+ * Hash the NFS4_DEVICEID4_SIZE bytes of @id into a bucket index in
+ * [0, NFS4_DEVICE_ID_HASH_SIZE): multiply-by-37 accumulation over the
+ * bytes, masked with NFS4_DEVICE_ID_HASH_MASK.
+ */
+static inline u32
+nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
+        unsigned char *bytes = (unsigned char *)id->data;
+        unsigned int i;
+        u32 hash = 0;
+        for (i = 0; i < NFS4_DEVICEID4_SIZE; i++)
+                hash = hash * 37 + bytes[i];
+        return hash & NFS4_DEVICE_ID_HASH_MASK;
+}
+/* One cached device ID, hashed into pnfs_deviceid_cache.dc_deviceids */
+struct pnfs_deviceid_node {
+        struct hlist_node       de_node;        /* hash chain linkage */
+        struct nfs4_deviceid    de_id;          /* lookup key, compared with memcmp() */
+        atomic_t                de_ref;         /* reference count; set to 1 on insert */
+};
+/* Device ID cache hung off struct nfs_client (cl_devid_cache) */
+struct pnfs_deviceid_cache {
+        spinlock_t              dc_lock;        /* held while adding/removing nodes */
+        atomic_t                dc_ref;         /* users of the cache; freed when it drops to zero */
+        /* called to dispose of a node that was discarded or dropped its last reference */
+        void                    (*dc_free_callback)(struct pnfs_deviceid_node *);
+        struct hlist_head       dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; /* buckets indexed by nfs4_deviceid_hash() */
+};
+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
+                        void (*free_callback)(struct pnfs_deviceid_node *));
+extern void pnfs_put_deviceid_cache(struct nfs_client *);
+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
+                                struct pnfs_deviceid_cache *,
+                                struct nfs4_deviceid *);
+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
+                                struct pnfs_deviceid_cache *,
+                                struct pnfs_deviceid_node *);
+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                              struct pnfs_deviceid_node *devid);
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+                                   struct pnfs_device *dev);
+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+/* pnfs.c */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                   enum pnfs_iomode access_type);
+void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+void put_layout_hdr(struct inode *inode);
+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                             struct nfs4_state *open_state);
+/*
+ * Map @iomode to the pnfs_layout_hdr.state bit recording a failed
+ * LAYOUTGET: the RW bit for IOMODE_RW, the RO bit for anything else.
+ */
+static inline int lo_fail_bit(u32 iomode)
+{
+        if (iomode == IOMODE_RW)
+                return NFS_LAYOUT_RW_FAILED;
+        return NFS_LAYOUT_RO_FAILED;
+}
+/* Non-zero when @nfss has a layout driver bound, i.e. pnfs_curr_ld is set */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+        return nfss->pnfs_curr_ld ? 1 : 0;
+}
+#else  /* CONFIG_NFS_V4_1 */
+/*
+ * Stubs used when CONFIG_NFS_V4_1 is not set: every pNFS entry point
+ * becomes a no-op and pnfs_update_layout() always reports "no layout".
+ */
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+static inline struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                   enum pnfs_iomode access_type)
+{
+        return NULL;
+}
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+{
+}
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 611bec22f552..58e7f84fc1fd 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -258,7 +258,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-                int flags, struct nameidata *nd)
+                int flags, struct nfs_open_context *ctx)
 {
        struct nfs_createdata *data;
        struct rpc_message msg = {
@@ -365,17 +365,32 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
        return 1;
 }
+/*
+ * rename_setup hook of nfs_v2_clientops: point @msg at the RENAME
+ * procedure.  @dir is not used.
+ */
+static void
+nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+{
+        msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+/*
+ * rename_done hook of nfs_v2_clientops.
+ *
+ * Returns 0 without touching the directories when
+ * nfs_async_handle_expired_key() reports non-zero for @task; otherwise
+ * marks both directories for revalidation and returns 1.
+ */
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+                     struct inode *new_dir)
+{
+        if (!nfs_async_handle_expired_key(task)) {
+                nfs_mark_for_revalidate(old_dir);
+                nfs_mark_for_revalidate(new_dir);
+                return 1;
+        }
+        return 0;
+}
 static int
 nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
                struct inode *new_dir, struct qstr *new_name)
 {
        struct nfs_renameargs   arg = {
-                .fromfh         = NFS_FH(old_dir),
+                .old_dir        = NFS_FH(old_dir),
-                .fromname       = old_name->name,
+                .old_name       = old_name,
-                .fromlen        = old_name->len,
+                .new_dir        = NFS_FH(new_dir),
-                .tofh           = NFS_FH(new_dir),
+                .new_name       = new_name,
-                .toname         = new_name->name,
-                .tolen          = new_name->len
        };
        struct rpc_message msg = {
                .rpc_proc       = &nfs_procedures[NFSPROC_RENAME],
@@ -519,14 +534,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
 */
 static int
 nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
-                 u64 cookie, struct page *page, unsigned int count, int plus)
+                 u64 cookie, struct page **pages, unsigned int count, int plus)
 {
        struct inode            *dir = dentry->d_inode;
        struct nfs_readdirargs  arg = {
                .fh             = NFS_FH(dir),
                .cookie         = cookie,
                .count          = count,
-                .pages          = &page,
+                .pages          = pages,
        };
        struct rpc_message      msg = {
                .rpc_proc       = &nfs_procedures[NFSPROC_READDIR],
@@ -705,6 +720,8 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .unlink_setup   = nfs_proc_unlink_setup,
        .unlink_done    = nfs_proc_unlink_done,
        .rename         = nfs_proc_rename,
+        .rename_setup   = nfs_proc_rename_setup,
+        .rename_done    = nfs_proc_rename_done,
        .link           = nfs_proc_link,
        .symlink        = nfs_proc_symlink,
        .mkdir          = nfs_proc_mkdir,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 87adc2744246..e4b62c6f5a6e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_PAGECACHE
@@ -46,7 +47,6 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -121,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
        len = nfs_page_length(page);
        if (len == 0)
                return nfs_return_empty_page(page);
+        pnfs_update_layout(inode, ctx, IOMODE_READ);
        new = nfs_create_request(ctx, inode, page, 0, len);
        if (IS_ERR(new)) {
                unlock_page(page);
@@ -625,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        if (ret == 0)
                goto read_complete; /* all pages were read */
+        pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f4cbf0c306c6..0a42e8f4adcb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
        Opt_lookupcache,
        Opt_fscache_uniq,
+        Opt_local_lock,
        /* Special mount options */
        Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -171,6 +172,7 @@ static const match_table_t nfs_mount_option_tokens = {
        { Opt_lookupcache, "lookupcache=%s" },
        { Opt_fscache_uniq, "fsc=%s" },
+        { Opt_local_lock, "local_lock=%s" },
        { Opt_err, NULL }
 };
@@ -236,14 +238,30 @@ static match_table_t nfs_lookupcache_tokens = {
        { Opt_lookupcache_err, NULL }
 };
+/* Token values for the argument of the "local_lock=" mount option */
+enum {
+        Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix,
+        Opt_local_lock_none,
+        Opt_local_lock_err
+};
+/* Accepted "local_lock=" argument strings; the NULL pattern entry ends the table */
+static match_table_t nfs_local_lock_tokens = {
+        { Opt_local_lock_all, "all" },
+        { Opt_local_lock_flock, "flock" },
+        { Opt_local_lock_posix, "posix" },
+        { Opt_local_lock_none, "none" },
+        { Opt_local_lock_err, NULL }
+};
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_xdev_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
-                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+                int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
@@ -259,7 +277,7 @@ static struct file_system_type nfs_fs_type = {
 struct file_system_type nfs_xdev_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs",
-        .get_sb         = nfs_xdev_get_sb,
+        .mount          = nfs_xdev_mount,
        .kill_sb        = nfs_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -284,14 +302,14 @@ static int nfs4_try_mount(int flags, const char *dev_name,
        struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
 static int nfs4_referral_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+        int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
@@ -305,7 +323,7 @@ static struct file_system_type nfs4_fs_type = {
 static struct file_system_type nfs4_remote_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_remote_get_sb,
+        .mount          = nfs4_remote_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -313,7 +331,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 struct file_system_type nfs4_xdev_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_xdev_get_sb,
+        .mount          = nfs4_xdev_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -321,7 +339,7 @@ struct file_system_type nfs4_xdev_fs_type = {
 static struct file_system_type nfs4_remote_referral_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
-        .get_sb         = nfs4_remote_referral_get_sb,
+        .mount          = nfs4_remote_referral_mount,
        .kill_sb        = nfs4_kill_super,
        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -622,6 +640,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        const struct proc_nfs_info *nfs_infop;
        struct nfs_client *clp = nfss->nfs_client;
        u32 version = clp->rpc_ops->version;
+        int local_flock, local_fcntl;
        seq_printf(m, ",vers=%u", version);
        seq_printf(m, ",rsize=%u", nfss->rsize);
@@ -670,6 +689,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
                else
                        seq_printf(m, ",lookupcache=pos");
        }
+        local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+        local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+        if (!local_flock && !local_fcntl)
+                seq_printf(m, ",local_lock=none");
+        else if (local_flock && local_fcntl)
+                seq_printf(m, ",local_lock=all");
+        else if (local_flock)
+                seq_printf(m, ",local_lock=flock");
+        else
+                seq_printf(m, ",local_lock=posix");
 }
 /*
@@ -1017,9 +1048,13 @@ static int nfs_parse_mount_options(char *raw,
                        break;
                case Opt_lock:
                        mnt->flags &= ~NFS_MOUNT_NONLM;
+                        mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+                                        NFS_MOUNT_LOCAL_FCNTL);
                        break;
                case Opt_nolock:
                        mnt->flags |= NFS_MOUNT_NONLM;
+                        mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+                                       NFS_MOUNT_LOCAL_FCNTL);
                        break;
                case Opt_v2:
                        mnt->flags &= ~NFS_MOUNT_VER3;
@@ -1420,6 +1455,34 @@ static int nfs_parse_mount_options(char *raw,
                        mnt->fscache_uniq = string;
                        mnt->options |= NFS_OPTION_FSCACHE;
                        break;
+                case Opt_local_lock:
+                        string = match_strdup(args);
+                        if (string == NULL)
+                                goto out_nomem;
+                        token = match_token(string, nfs_local_lock_tokens,
+                                        args);
+                        kfree(string);
+                        switch (token) {
+                        case Opt_local_lock_all:
+                                mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+                                               NFS_MOUNT_LOCAL_FCNTL);
+                                break;
+                        case Opt_local_lock_flock:
+                                mnt->flags |= NFS_MOUNT_LOCAL_FLOCK;
+                                break;
+                        case Opt_local_lock_posix:
+                                mnt->flags |= NFS_MOUNT_LOCAL_FCNTL;
+                                break;
+                        case Opt_local_lock_none:
+                                mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+                                                NFS_MOUNT_LOCAL_FCNTL);
+                                break;
+                        default:
+                                dfprintk(MOUNT, "NFS:   invalid "
+                                                "local_lock argument\n");
+                                return 0;
+                        };
+                        break;
                /*
                 * Special options
@@ -1825,6 +1888,12 @@ static int nfs_validate_mount_data(void *options,
                if (!args->nfs_server.hostname)
                        goto out_nomem;
+                if (!(data->flags & NFS_MOUNT_NONLM))
+                        args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+                                         NFS_MOUNT_LOCAL_FCNTL);
+                else
+                        args->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+                                        NFS_MOUNT_LOCAL_FCNTL);
                /*
                 * The legacy version 6 binary mount data from userspace has a
                 * field used only to transport selinux information into the
@@ -2328,9 +2397,9 @@ static void nfs_kill_super(struct super_block *s)
 /*
 * Clone an NFS2/3 server record on xdev traversal (FSID-change)
 */
-static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *
-                           const char *dev_name, void *raw_data,
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
-                           struct vfsmount *mnt)
+                const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -2342,7 +2411,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
        };
        int error;
-        dprintk("--> nfs_xdev_get_sb()\n");
+        dprintk("--> nfs_xdev_mount()\n");
        /* create a new volume representation */
        server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2389,28 +2458,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        /* clone any lsm security options from the parent to the new sb */
        security_sb_clone_mnt_opts(data->sb, s);
-        dprintk("<-- nfs_xdev_get_sb() = 0\n");
+        dprintk("<-- nfs_xdev_mount() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
 out_err_noserver:
-        dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
+        dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
                bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
        deactivate_locked_super(s);
-        dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
+        dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
 #ifdef CONFIG_NFS_V4
@@ -2441,7 +2508,8 @@ static void nfs4_fill_super(struct super_block *sb)
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 {
-        args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
+        args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3|
+                         NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 static int nfs4_validate_text_mount_data(void *options,
@@ -2579,8 +2647,9 @@ out_no_address:
 /*
 * Get the superblock for the NFS4 root partition
 */
-static int nfs4_remote_get_sb(struct file_system_type *fs_type,
+static struct dentry *
-        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+nfs4_remote_mount(struct file_system_type *fs_type, int flags,
+                  const char *dev_name, void *raw_data)
 {
        struct nfs_parsed_mount_data *data = raw_data;
        struct super_block *s;
@@ -2644,15 +2713,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
                goto error_splat_root;
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
+        security_free_mnt_opts(&data->lsm_opts);
-        error = 0;
+        nfs_free_fhandle(mntfh);
+        return mntroot;
 out:
        security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
        nfs_free_fhandle(mntfh);
-        return error;
+        return ERR_PTR(error);
 out_free:
        nfs_free_server(server);
@@ -2898,9 +2968,9 @@ static void nfs4_kill_super(struct super_block *sb)
 /*
 * Clone an NFS4 server record on xdev traversal (FSID-change)
 */
-static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *
-                            const char *dev_name, void *raw_data,
+nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
-                            struct vfsmount *mnt)
+                 const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -2912,7 +2982,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
        };
        int error;
-        dprintk("--> nfs4_xdev_get_sb()\n");
+        dprintk("--> nfs4_xdev_mount()\n");
        /* create a new volume representation */
        server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
@@ -2959,32 +3029,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        security_sb_clone_mnt_opts(data->sb, s);
-        dprintk("<-- nfs4_xdev_get_sb() = 0\n");
+        dprintk("<-- nfs4_xdev_mount() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
 out_err_noserver:
-        dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
+        dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
                bdi_unregister(&server->backing_dev_info);
 error_splat_bdi:
        deactivate_locked_super(s);
-        dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
+        dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
-static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
+static struct dentry *
-                int flags, const char *dev_name, void *raw_data,
+nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
-                struct vfsmount *mnt)
+                           const char *dev_name, void *raw_data)
 {
        struct nfs_clone_mount *data = raw_data;
        struct super_block *s;
@@ -3048,14 +3116,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
        }
        s->s_flags |= MS_ACTIVE;
-        mnt->mnt_sb = s;
-        mnt->mnt_root = mntroot;
        security_sb_clone_mnt_opts(data->sb, s);
        nfs_free_fhandle(mntfh);
        dprintk("<-- nfs4_referral_get_sb() = 0\n");
-        return 0;
+        return mntroot;
 out_err_nosb:
        nfs_free_server(server);
@@ -3063,7 +3129,7 @@ out_err_noserver:
        nfs_free_fhandle(mntfh);
 out_err_nofh:
        dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-        return error;
+        return ERR_PTR(error);
 error_splat_super:
        if (server && !s->s_root)
@@ -3072,7 +3138,7 @@ error_splat_bdi:
        deactivate_locked_super(s);
        nfs_free_fhandle(mntfh);
        dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-        return error;
+        return ERR_PTR(error);
 }
 /*
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index ad4d2e787b20..978aaeb8a093 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,6 +32,7 @@ static ctl_table nfs_cb_sysctls[] = {
                .extra1 = (int *)&nfs_set_port_min,
                .extra2 = (int *)&nfs_set_port_max,
        },
+#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
        {
                .procname = "idmap_cache_timeout",
                .data = &nfs_idmap_cache_timeout,
@@ -39,6 +40,7 @@ static ctl_table nfs_cb_sysctls[] = {
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
+#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
        {
                .procname       = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 2f84adaad427..7bdec8531400 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -13,9 +13,12 @@
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/namei.h>
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
 struct nfs_unlinkdata {
        struct hlist_node list;
@@ -244,7 +247,7 @@ void nfs_unblock_sillyrename(struct dentry *dentry)
 * @dir: parent directory of dentry
 * @dentry: dentry to unlink
 */
-int
+static int
 nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct nfs_unlinkdata *data;
@@ -259,7 +262,6 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
                status = PTR_ERR(data->cred);
                goto out_free;
        }
-        data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        data->res.dir_attr = &data->dir_attr;
        status = -EBUSY;
@@ -303,3 +305,256 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
        if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
                nfs_free_unlinkdata(data);
 }
+/*
+ * Undo a previously queued async unlink for @dentry. Used when the
+ * sillyrename RPC could not be started or completed with an error.
+ */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+        struct nfs_unlinkdata *pending = NULL;
+        int was_renamed;
+        spin_lock(&dentry->d_lock);
+        was_renamed = (dentry->d_flags & DCACHE_NFSFS_RENAMED) != 0;
+        if (was_renamed) {
+                /* detach the unlink data and clear the marker under d_lock */
+                pending = dentry->d_fsdata;
+                dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+        }
+        spin_unlock(&dentry->d_lock);
+        /* the unlink data is freed only after d_lock has been dropped */
+        if (was_renamed)
+                nfs_free_unlinkdata(pending);
+}
+/*
+ * Per-call state of an asynchronous (silly)rename RPC. Allocated and
+ * filled in by nfs_async_rename(), freed by nfs_async_rename_release().
+ */
+struct nfs_renamedata {
+        struct nfs_renameargs   args;           /* RPC arguments (msg.rpc_argp) */
+        struct nfs_renameres    res;            /* RPC results (msg.rpc_resp) */
+        struct rpc_cred         *cred;          /* credential from rpc_lookup_cred() */
+        struct inode            *old_dir;       /* source directory, reference held */
+        struct dentry           *old_dentry;    /* dentry being renamed, reference held */
+        struct nfs_fattr        old_fattr;      /* storage behind res.old_fattr */
+        struct inode            *new_dir;       /* target directory, reference held */
+        struct dentry           *new_dentry;    /* rename target, reference held */
+        struct nfs_fattr        new_fattr;      /* storage behind res.new_fattr */
+};
+/**
+ * nfs_async_rename_done - completion handler for the sillyrename RPC
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * The protocol specific rename_done hook sees the result first; when it
+ * returns zero the RPC is restarted. A failed rename cancels the queued
+ * async unlink, a successful one refreshes the dentry verifier and
+ * d_move()s the old dentry onto the new one.
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs_renamedata *rd = calldata;
+        struct inode *src_dir = rd->old_dir;
+        struct inode *dst_dir = rd->new_dir;
+        if (NFS_PROTO(src_dir)->rename_done(task, src_dir, dst_dir) == 0) {
+                nfs_restart_rpc(task, NFS_SERVER(src_dir)->nfs_client);
+                return;
+        }
+        if (task->tk_status == 0) {
+                nfs_set_verifier(rd->old_dentry, nfs_save_change_attribute(src_dir));
+                d_move(rd->old_dentry, rd->new_dentry);
+                return;
+        }
+        /* the rename failed: the file keeps its name, so drop the unlink */
+        nfs_cancel_async_unlink(rd->old_dentry);
+}
+/**
+ * nfs_async_rename_release - tear down the state of a sillyrename RPC
+ * @calldata: the struct nfs_renamedata to be released
+ *
+ * Marks the renamed inode (if any) for revalidation, then drops the
+ * dentry, inode, superblock and credential references and frees the data.
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+        struct nfs_renamedata *rd = calldata;
+        /* remember the superblock: old_dir is released below */
+        struct super_block *sb = rd->old_dir->i_sb;
+        struct inode *renamed = rd->old_dentry->d_inode;
+        if (renamed != NULL)
+                nfs_mark_for_revalidate(renamed);
+        dput(rd->old_dentry);
+        dput(rd->new_dentry);
+        iput(rd->old_dir);
+        iput(rd->new_dir);
+        nfs_sb_deactive(sb);
+        put_rpccred(rd->cred);
+        kfree(rd);
+}
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * rpc_call_prepare hook for the rename task: the call is only started
+ * once nfs4_setup_sequence() reports success (returns zero).
+ */
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs_renamedata *rd = calldata;
+        struct nfs_server *srv = NFS_SERVER(rd->old_dir);
+        if (nfs4_setup_sequence(srv, &rd->args.seq_args,
+                                &rd->res.seq_res, 1, task) == 0)
+                rpc_call_start(task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+/*
+ * RPC callbacks for the asynchronous sillyrename: completion handling,
+ * release of the nfs_renamedata and, on CONFIG_NFS_V4_1 builds only,
+ * a prepare hook.
+ */
+static const struct rpc_call_ops nfs_rename_ops = {
+        .rpc_call_done = nfs_async_rename_done,
+        .rpc_release = nfs_async_rename_release,
+#if defined(CONFIG_NFS_V4_1)
+        .rpc_call_prepare = nfs_rename_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+};
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ * by the caller. This function takes its own references (ihold/dget) and
+ * calls nfs_sb_active() on the superblock; they travel with the callback
+ * data and are dropped again by nfs_async_rename_release().
+ *
+ * Returns whatever rpc_run_task() returns, or an ERR_PTR() when the rename
+ * data cannot be allocated (-ENOMEM) or the credential lookup fails.
+ */
+static struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+                 struct dentry *old_dentry, struct dentry *new_dentry)
+{
+        struct nfs_renamedata *data;
+        struct rpc_message msg = { };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_message = &msg,
+                .callback_ops = &nfs_rename_ops,
+                .workqueue = nfsiod_workqueue,
+                .rpc_client = NFS_CLIENT(old_dir),
+                .flags = RPC_TASK_ASYNC,
+        };
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
+        if (data == NULL)
+                return ERR_PTR(-ENOMEM);
+        task_setup_data.callback_data = data;
+        data->cred = rpc_lookup_cred();
+        if (IS_ERR(data->cred)) {
+                /* nothing else is held yet, so only the data needs freeing */
+                struct rpc_task *task = ERR_CAST(data->cred);
+                kfree(data);
+                return task;
+        }
+        msg.rpc_argp = &data->args;
+        msg.rpc_resp = &data->res;
+        msg.rpc_cred = data->cred;
+        /* set up nfs_renamedata */
+        data->old_dir = old_dir;
+        ihold(old_dir);
+        data->new_dir = new_dir;
+        ihold(new_dir);
+        data->old_dentry = dget(old_dentry);
+        data->new_dentry = dget(new_dentry);
+        nfs_fattr_init(&data->old_fattr);
+        nfs_fattr_init(&data->new_fattr);
+        /* set up nfs_renameargs */
+        data->args.old_dir = NFS_FH(old_dir);
+        data->args.old_name = &old_dentry->d_name;
+        data->args.new_dir = NFS_FH(new_dir);
+        data->args.new_name = &new_dentry->d_name;
+        /* set up nfs_renameres */
+        data->res.old_fattr = &data->old_fattr;
+        data->res.new_fattr = &data->new_fattr;
+        /* balanced by nfs_sb_deactive() in nfs_async_rename_release() */
+        nfs_sb_active(old_dir->i_sb);
+        NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+        return rpc_run_task(&task_setup_data);
+}
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ *
+ * Returns 0 on success. Returns -EBUSY if the dentry was already
+ * silly-renamed, if looking up a candidate name fails, or if the rename
+ * task cannot be started. Otherwise returns the error from
+ * nfs_async_unlink(), from rpc_wait_for_completion_task(), or the
+ * completed task's tk_status.
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+        static unsigned int sillycounter;
+        /* two hex digits per byte of the fileid / of the counter */
+        const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2;
+        const int      countersize = sizeof(sillycounter)*2;
+        /* sizeof(".nfs") counts the terminating NUL, hence the -1 */
+        const int      slen        = sizeof(".nfs")+fileidsize+countersize-1;
+        char           silly[slen+1];
+        struct dentry *sdentry;
+        struct rpc_task *task;
+        int            error = -EIO; /* overwritten below before first use */
+        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
+                dentry->d_parent->d_name.name, dentry->d_name.name,
+                atomic_read(&dentry->d_count));
+        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+        /*
+         * We don't allow a dentry to be silly-renamed twice.
+         */
+        error = -EBUSY;
+        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+                goto out;
+        /* fixed prefix: ".nfs" followed by the zero-padded hex fileid */
+        sprintf(silly, ".nfs%*.*Lx",
+                fileidsize, fileidsize,
+                (unsigned long long)NFS_FILEID(dentry->d_inode));
+        /* Return delegation in anticipation of the rename */
+        nfs_inode_return_delegation(dentry->d_inode);
+        sdentry = NULL;
+        do {
+                /* the counter occupies the last countersize characters */
+                char *suffix = silly + slen - countersize;
+                /* drop the candidate of the previous round (NULL at first) */
+                dput(sdentry);
+                sillycounter++;
+                sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
+                dfprintk(VFS, "NFS: trying to rename %s to %s\n",
+                                dentry->d_name.name, silly);
+                sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+                /*
+                 * N.B. Better to return EBUSY here ... it could be
+                 * dangerous to delete the file while it's in use.
+                 */
+                if (IS_ERR(sdentry))
+                        goto out;
+        } while (sdentry->d_inode != NULL); /* need negative lookup */
+        /* queue unlink first. Can't do this from rpc_release as it
+         * has to allocate memory
+         */
+        error = nfs_async_unlink(dir, dentry);
+        if (error)
+                goto out_dput;
+        /* run the rename task, undo unlink if it fails */
+        task = nfs_async_rename(dir, dir, dentry, sdentry);
+        if (IS_ERR(task)) {
+                error = -EBUSY;
+                nfs_cancel_async_unlink(dentry);
+                goto out_dput;
+        }
+        /* wait for the RPC task to complete, unless a SIGKILL intervenes */
+        error = rpc_wait_for_completion_task(task);
+        if (error == 0)
+                error = task->tk_status;
+        rpc_put_task(task);
+out_dput:
+        dput(sdentry);
+out:
+        return error;
+}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 874972d9427c..4c14c17a5276 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -55,7 +55,6 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
        }
        return p;
 }
@@ -75,7 +74,6 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
                p->npages = pagecount;
-                p->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -292,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
        nfs_pageio_cond_complete(pgio, page->index);
-        ret = nfs_page_async_flush(pgio, page,
+        ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
-                        wbc->sync_mode == WB_SYNC_NONE ||
-                        wbc->nonblocking != 0);
        if (ret == -EAGAIN) {
                redirty_page_for_writepage(wbc, page);
                ret = 0;
@@ -1433,15 +1429,17 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
        int flags = FLUSH_SYNC;
        int ret = 0;
-        /* Don't commit yet if this is a non-blocking flush and there are
+        if (wbc->sync_mode == WB_SYNC_NONE) {
-         * lots of outstanding writes for this mapping.
+                /* Don't commit yet if this is a non-blocking flush and there
-         */
+                 * are a lot of outstanding writes for this mapping.
-        if (wbc->sync_mode == WB_SYNC_NONE &&
+                 */
-            nfsi->ncommit <= (nfsi->npages >> 1))
+                if (nfsi->ncommit <= (nfsi->npages >> 1))
-                goto out_mark_dirty;
+                        goto out_mark_dirty;
-        if (wbc->nonblocking || wbc->for_background)
+                /* don't wait for the COMMIT response */
                flags = 0;
+        }
        ret = nfs_commit_inode(inode, flags);
        if (ret >= 0) {
                if (wbc->sync_mode == WB_SYNC_NONE) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..18b3e8975fe0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -28,6 +28,18 @@ config NFSD
          If unsure, say N.
+config NFSD_DEPRECATED
+        bool "Include support for deprecated syscall interface to NFSD"
+        depends on NFSD
+        default y
+        help
+          The syscall interface to nfsd was obsoleted in 2.6.0 by a new
+          filesystem based interface.  The old interface is due for removal
+          in 2.6.40.  If you wish to remove the interface before then
+          say N.
+          If unsure, say Y.
 config NFSD_V2_ACL
        bool
        depends on NFSD
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c2a4f71d87dd..c0fcb7ab7f6d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -28,9 +28,6 @@
 typedef struct auth_domain      svc_client;
 typedef struct svc_export       svc_export;
-static void             exp_do_unexport(svc_export *unexp);
-static int              exp_verify_string(char *cp, int max);
 /*
 * We have two caches.
 * One maps client+vfsmnt+dentry to export options - the export map
@@ -802,6 +799,7 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
        return ek;
 }
+#ifdef CONFIG_NFSD_DEPRECATED
 static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
                       struct svc_export *exp)
 {
@@ -852,6 +850,7 @@ exp_get_fsid_key(svc_client *clp, int fsid)
        return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
+#endif
 static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
                                     struct cache_req *reqp)
@@ -893,6 +892,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path)
        return exp;
 }
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
 * Hashtable locking. Write locks are placed only by user processes
 * wanting to modify export information.
@@ -925,6 +925,19 @@ exp_writeunlock(void)
 {
        up_write(&hash_sem);
 }
+#else
+/* hash_sem not needed once deprecated interface is removed */
+void exp_readlock(void) {}
+static inline void exp_writelock(void){}
+void exp_readunlock(void) {}
+static inline void exp_writeunlock(void){}
+#endif
+#ifdef CONFIG_NFSD_DEPRECATED
+static void             exp_do_unexport(svc_export *unexp);
+static int              exp_verify_string(char *cp, int max);
 static void exp_fsid_unhash(struct svc_export *exp)
 {
@@ -935,10 +948,9 @@ static void exp_fsid_unhash(struct svc_export *exp)
        ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
        if (!IS_ERR(ek)) {
-                ek->h.expiry_time = get_seconds()-1;
+                sunrpc_invalidate(&ek->h, &svc_expkey_cache);
                cache_put(&ek->h, &svc_expkey_cache);
        }
-        svc_expkey_cache.nextcheck = get_seconds();
 }
 static int exp_fsid_hash(svc_client *clp, struct svc_export *exp)
@@ -973,10 +985,9 @@ static void exp_unhash(struct svc_export *exp)
        ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
        if (!IS_ERR(ek)) {
-                ek->h.expiry_time = get_seconds()-1;
+                sunrpc_invalidate(&ek->h, &svc_expkey_cache);
                cache_put(&ek->h, &svc_expkey_cache);
        }
-        svc_expkey_cache.nextcheck = get_seconds();
 }
        
 /*
@@ -1097,8 +1108,7 @@ out:
 static void
 exp_do_unexport(svc_export *unexp)
 {
-        unexp->h.expiry_time = get_seconds()-1;
+        sunrpc_invalidate(&unexp->h, &svc_export_cache);
-        svc_export_cache.nextcheck = get_seconds();
        exp_unhash(unexp);
        exp_fsid_unhash(unexp);
 }
@@ -1150,6 +1160,7 @@ out_unlock:
        exp_writeunlock();
        return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /*
 * Obtain the root fh on behalf of a client.
@@ -1459,25 +1470,43 @@ static void show_secinfo_flags(struct seq_file *m, int flags)
        show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
 }
+/* Compare two flag words, looking only at the NFSEXP_SECINFO_FLAGS bits. */
+static bool secinfo_flags_equal(int f, int g)
+{
+        return (f & NFSEXP_SECINFO_FLAGS) == (g & NFSEXP_SECINFO_FLAGS);
+}
+/*
+ * Print one ",sec=a:b:c" group: the flavor at *fp plus every following
+ * flavor whose secinfo flags match it. On return *fp points just past the
+ * group; the flags of the group's first flavor are returned.
+ */
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp,
+                            struct exp_flavor_info *end)
+{
+        struct exp_flavor_info *cur = *fp;
+        int flags = cur->flags;
+        seq_printf(m, ",sec=%d", cur->pseudoflavor);
+        for (cur++; cur != end && secinfo_flags_equal(flags, cur->flags); cur++)
+                seq_printf(m, ":%d", cur->pseudoflavor);
+        *fp = cur;
+        return flags;
+}
 static void show_secinfo(struct seq_file *m, struct svc_export *exp)
 {
        struct exp_flavor_info *f;
        struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-        int lastflags = 0, first = 0;
+        int flags;
        if (exp->ex_nflavors == 0)
                return;
-        for (f = exp->ex_flavors; f < end; f++) {
+        f = exp->ex_flavors;
-                if (first || f->flags != lastflags) {
+        flags = show_secinfo_run(m, &f, end);
-                        if (!first)
+        if (!secinfo_flags_equal(flags, exp->ex_flags))
-                                show_secinfo_flags(m, lastflags);
+                show_secinfo_flags(m, flags);
-                        seq_printf(m, ",sec=%d", f->pseudoflavor);
+        while (f != end) {
-                        lastflags = f->flags;
+                flags = show_secinfo_run(m, &f, end);
-                } else {
+                show_secinfo_flags(m, flags);
-                        seq_printf(m, ":%d", f->pseudoflavor);
-                }
        }
-        show_secinfo_flags(m, lastflags);
 }
 static void exp_flags(struct seq_file *m, int flag, int fsid,
@@ -1532,6 +1561,7 @@ const struct seq_operations nfs_exports_op = {
        .show   = e_show,
 };
+#ifdef CONFIG_NFSD_DEPRECATED
 /*
 * Add or modify a client.
 * Change requests may involve the list of host addresses. The list of
@@ -1563,7 +1593,7 @@ exp_addclient(struct nfsctl_client *ncp)
        /* Insert client into hashtable. */
        for (i = 0; i < ncp->cl_naddr; i++) {
                ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6);
-                auth_unix_add_addr(&addr6, dom);
+                auth_unix_add_addr(&init_net, &addr6, dom);
        }
        auth_unix_forget_old(dom);
        auth_domain_put(dom);
@@ -1621,6 +1651,7 @@ exp_verify_string(char *cp, int max)
        printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp);
        return 0;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /*
 * Initialize the exports module.
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 988cbb3a19b6..143da2eecd7b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -41,7 +41,6 @@
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 /* Index of predefined Linux callback client operations */
@@ -248,10 +247,11 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
 }
 static void
-encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
                   struct nfs4_cb_compound_hdr *hdr)
 {
        __be32 *p;
+        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
        if (hdr->minorversion == 0)
                return;
@@ -259,8 +259,8 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
        RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
        WRITE32(OP_CB_SEQUENCE);
-        WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+        WRITEMEM(ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN);
-        WRITE32(args->cbs_clp->cl_cb_seq_nr);
+        WRITE32(ses->se_cb_seq_nr);
        WRITE32(0);             /* slotid, always 0 */
        WRITE32(0);             /* highest slotid always 0 */
        WRITE32(0);             /* cachethis always 0 */
@@ -280,18 +280,18 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 static int
 nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
-                struct nfs4_rpc_args *rpc_args)
+                struct nfsd4_callback *cb)
 {
        struct xdr_stream xdr;
-        struct nfs4_delegation *args = rpc_args->args_op;
+        struct nfs4_delegation *args = cb->cb_op;
        struct nfs4_cb_compound_hdr hdr = {
-                .ident = args->dl_ident,
+                .ident = cb->cb_clp->cl_cb_ident,
-                .minorversion = rpc_args->args_seq.cbs_minorversion,
+                .minorversion = cb->cb_minorversion,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
        encode_cb_compound_hdr(&xdr, &hdr);
-        encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
+        encode_cb_sequence(&xdr, cb, &hdr);
        encode_cb_recall(&xdr, args, &hdr);
        encode_cb_nops(&hdr);
        return 0;
@@ -339,15 +339,16 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 * with a single slot.
 */
 static int
-decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_callback *cb,
                   struct rpc_rqst *rqstp)
 {
+        struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
        struct nfs4_sessionid id;
        int status;
        u32 dummy;
        __be32 *p;
-        if (res->cbs_minorversion == 0)
+        if (cb->cb_minorversion == 0)
                return 0;
        status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
@@ -363,13 +364,12 @@ decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
        READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
        memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
        p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
-        if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+        if (memcmp(id.data, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) {
-                   NFS4_MAX_SESSIONID_LEN)) {
                dprintk("%s Invalid session id\n", __func__);
                goto out;
        }
        READ32(dummy);
-        if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+        if (dummy != ses->se_cb_seq_nr) {
                dprintk("%s Invalid sequence number\n", __func__);
                goto out;
        }
@@ -393,7 +393,7 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 static int
 nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
-                struct nfsd4_cb_sequence *seq)
+                struct nfsd4_callback *cb)
 {
        struct xdr_stream xdr;
        struct nfs4_cb_compound_hdr hdr;
@@ -403,8 +403,8 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
        status = decode_cb_compound_hdr(&xdr, &hdr);
        if (status)
                goto out;
-        if (seq) {
+        if (cb) {
-                status = decode_cb_sequence(&xdr, seq, rqstp);
+                status = decode_cb_sequence(&xdr, cb, rqstp);
                if (status)
                        goto out;
        }
@@ -473,30 +473,34 @@ static int max_cb_time(void)
 /* Reference counting, callback cleanup, etc., all look racy as heck.
 * And why is cl_cb_set an atomic? */
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
 {
        struct rpc_timeout      timeparms = {
                .to_initval     = max_cb_time(),
                .to_retries     = 0,
        };
        struct rpc_create_args args = {
-                .protocol       = XPRT_TRANSPORT_TCP,
+                .net            = &init_net,
-                .address        = (struct sockaddr *) &cb->cb_addr,
+                .address        = (struct sockaddr *) &conn->cb_addr,
-                .addrsize       = cb->cb_addrlen,
+                .addrsize       = conn->cb_addrlen,
                .timeout        = &timeparms,
                .program        = &cb_program,
-                .prognumber     = cb->cb_prog,
                .version        = 0,
                .authflavor     = clp->cl_flavor,
                .flags          = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
-                .client_name    = clp->cl_principal,
        };
        struct rpc_clnt *client;
-        if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+        if (clp->cl_minorversion == 0) {
-                return -EINVAL;
+                if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
-        if (cb->cb_minorversion) {
+                        return -EINVAL;
-                args.bc_xprt = cb->cb_xprt;
+                args.client_name = clp->cl_principal;
+                args.prognumber = conn->cb_prog,
+                args.protocol = XPRT_TRANSPORT_TCP;
+                clp->cl_cb_ident = conn->cb_ident;
+        } else {
+                args.bc_xprt = conn->cb_xprt;
+                args.prognumber = clp->cl_cb_session->se_cb_prog;
                args.protocol = XPRT_TRANSPORT_BC_TCP;
        }
        /* Create RPC client */
@@ -506,7 +510,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
                        PTR_ERR(client));
                return PTR_ERR(client);
        }
-        nfsd4_set_callback_client(clp, client);
+        clp->cl_cb_client = client;
        return 0;
 }
@@ -519,7 +523,7 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_client *clp = calldata;
+        struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
        if (task->tk_status)
                warn_no_callback_path(clp, task->tk_status);
@@ -528,6 +532,8 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 }
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+        /* XXX: release method to ensure we set the cb channel down if
+         * necessary on early failure? */
        .rpc_call_done = nfsd4_cb_probe_done,
 };
@@ -543,38 +549,42 @@ int set_callback_cred(void)
        return 0;
 }
+static struct workqueue_struct *callback_wq;
-void do_probe_callback(struct nfs4_client *clp)
+static void do_probe_callback(struct nfs4_client *clp)
 {
-        struct rpc_message msg = {
+        struct nfsd4_callback *cb = &clp->cl_cb_null;
-                .rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-                .rpc_argp       = clp,
-                .rpc_cred       = callback_cred
-        };
-        int status;
-        status = rpc_call_async(clp->cl_cb_client, &msg,
+        cb->cb_op = NULL;
-                                RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+        cb->cb_clp = clp;
-                                &nfsd4_cb_probe_ops, (void *)clp);
-        if (status)
+        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
-                warn_no_callback_path(clp, status);
+        cb->cb_msg.rpc_argp = NULL;
+        cb->cb_msg.rpc_resp = NULL;
+        cb->cb_msg.rpc_cred = callback_cred;
+        cb->cb_ops = &nfsd4_cb_probe_ops;
+        queue_work(callback_wq, &cb->cb_work);
 }
 /*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
 */
-void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
+void nfsd4_probe_callback(struct nfs4_client *clp)
 {
-        int status;
+        set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+        do_probe_callback(clp);
+}
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
        BUG_ON(atomic_read(&clp->cl_cb_set));
-        status = setup_callback_client(clp, cb);
+        spin_lock(&clp->cl_lock);
-        if (status) {
+        memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
-                warn_no_callback_path(clp, status);
+        spin_unlock(&clp->cl_lock);
-                return;
-        }
-        do_probe_callback(clp);
 }
 /*
@@ -585,8 +595,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
                struct rpc_task *task)
 {
-        struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+        u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
-        u32 *ptr = (u32 *)clp->cl_sessionid.data;
        int status = 0;
        dprintk("%s: %u:%u:%u:%u\n", __func__,
@@ -598,14 +607,6 @@ static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
                status = -EAGAIN;
                goto out;
        }
-        /*
-         * We'll need the clp during XDR encoding and decoding,
-         * and the sequence during decoding to verify the reply
-         */
-        args->args_seq.cbs_clp = clp;
-        task->tk_msg.rpc_resp = &args->args_seq;
 out:
        dprintk("%s status=%d\n", __func__, status);
        return status;
@@ -617,13 +618,13 @@ out:
 */
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
-        struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+        u32 minorversion = clp->cl_minorversion;
-        u32 minorversion = clp->cl_cb_conn.cb_minorversion;
        int status = 0;
-        args->args_seq.cbs_minorversion = minorversion;
+        cb->cb_minorversion = minorversion;
        if (minorversion) {
                status = nfsd41_cb_setup_sequence(clp, task);
                if (status) {
@@ -640,19 +641,20 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
        dprintk("%s: minorversion=%d\n", __func__,
-                clp->cl_cb_conn.cb_minorversion);
+                clp->cl_minorversion);
-        if (clp->cl_cb_conn.cb_minorversion) {
+        if (clp->cl_minorversion) {
                /* No need for lock, access serialized in nfsd4_cb_prepare */
-                ++clp->cl_cb_seq_nr;
+                ++clp->cl_cb_session->se_cb_seq_nr;
                clear_bit(0, &clp->cl_cb_slot_busy);
                rpc_wake_up_next(&clp->cl_cb_waitq);
                dprintk("%s: freed slot, new seqid=%d\n", __func__,
-                        clp->cl_cb_seq_nr);
+                        clp->cl_cb_session->se_cb_seq_nr);
                /* We're done looking into the sequence information */
                task->tk_msg.rpc_resp = NULL;
@@ -662,7 +664,8 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        struct nfs4_client *clp = dp->dl_client;
        struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
@@ -707,7 +710,8 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_recall_release(void *calldata)
 {
-        struct nfs4_delegation *dp = calldata;
+        struct nfsd4_callback *cb = calldata;
+        struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
        nfs4_put_delegation(dp);
 }
@@ -718,8 +722,6 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
        .rpc_release = nfsd4_cb_recall_release,
 };
-static struct workqueue_struct *callback_wq;
 int nfsd4_create_callback_queue(void)
 {
        callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
@@ -734,57 +736,88 @@ void nfsd4_destroy_callback_queue(void)
 }
 /* must be called under the state lock */
-void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
 {
-        struct rpc_clnt *old = clp->cl_cb_client;
+        set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags);
-        clp->cl_cb_client = new;
        /*
-         * After this, any work that saw the old value of cl_cb_client will
+         * Note this won't actually result in a null callback;
-         * be gone:
+         * instead, nfsd4_do_callback_rpc() will detect the killed
+         * client, destroy the rpc client, and stop:
         */
+        do_probe_callback(clp);
        flush_workqueue(callback_wq);
-        /* So we can safely shut it down: */
-        if (old)
-                rpc_shutdown_client(old);
 }
-/*
+void nfsd4_release_cb(struct nfsd4_callback *cb)
- * called with dp->dl_count inc'ed.
- */
-static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
-        struct nfs4_client *clp = dp->dl_client;
+        if (cb->cb_ops->rpc_release)
-        struct rpc_clnt *clnt = clp->cl_cb_client;
+                cb->cb_ops->rpc_release(cb);
-        struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
+}
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
-                .rpc_cred = callback_cred
-        };
-        if (clnt == NULL) {
+void nfsd4_process_cb_update(struct nfsd4_callback *cb)
-                nfs4_put_delegation(dp);
+{
-                return; /* Client is shutting down; give up. */
+        struct nfs4_cb_conn conn;
+        struct nfs4_client *clp = cb->cb_clp;
+        int err;
+        /*
+         * This is either an update, or the client dying; in either case,
+         * kill the old client:
+         */
+        if (clp->cl_cb_client) {
+                rpc_shutdown_client(clp->cl_cb_client);
+                clp->cl_cb_client = NULL;
        }
+        if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
+                return;
+        spin_lock(&clp->cl_lock);
+        /*
+         * Only serialized callback code is allowed to clear these
+         * flags; main nfsd code can only set them:
+         */
+        BUG_ON(!clp->cl_cb_flags);
+        clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
+        memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+        spin_unlock(&clp->cl_lock);
-        args->args_op = dp;
+        err = setup_callback_client(clp, &conn);
-        msg.rpc_argp = args;
+        if (err)
-        dp->dl_retries = 1;
+                warn_no_callback_path(clp, err);
-        rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
 }
 void nfsd4_do_callback_rpc(struct work_struct *w)
 {
-        /* XXX: for now, just send off delegation recall. */
+        struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
-        /* In future, generalize to handle any sort of callback. */
+        struct nfs4_client *clp = cb->cb_clp;
-        struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+        struct rpc_clnt *clnt;
-        struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
-        _nfsd4_cb_recall(dp);
+        if (clp->cl_cb_flags)
-}
+                nfsd4_process_cb_update(cb);
+        clnt = clp->cl_cb_client;
+        if (!clnt) {
+                /* Callback channel broken, or client killed; give up: */
+                nfsd4_release_cb(cb);
+                return;
+        }
+        rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+                        cb->cb_ops, cb);
+}
 void nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
+        struct nfsd4_callback *cb = &dp->dl_recall;
+        dp->dl_retries = 1;
+        cb->cb_op = dp;
+        cb->cb_clp = dp->dl_client;
+        cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
+        cb->cb_msg.rpc_argp = cb;
+        cb->cb_msg.rpc_resp = cb;
+        cb->cb_msg.rpc_cred = callback_cred;
+        cb->cb_ops = &nfsd4_cb_recall_ops;
+        dp->dl_retries = 1;
        queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c78dbf493424..f0695e815f0e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -482,109 +482,26 @@ nfsd_idmap_shutdown(void)
        cache_unregister(&nametoid_cache);
 }
-/*
- * Deferred request handling
- */
-struct idmap_defer_req {
-       struct cache_req         req;
-       struct cache_deferred_req deferred_req;
-       wait_queue_head_t        waitq;
-       atomic_t                 count;
-};
-static inline void
-put_mdr(struct idmap_defer_req *mdr)
-{
-        if (atomic_dec_and_test(&mdr->count))
-                kfree(mdr);
-}
-static inline void
-get_mdr(struct idmap_defer_req *mdr)
-{
-        atomic_inc(&mdr->count);
-}
-static void
-idmap_revisit(struct cache_deferred_req *dreq, int toomany)
-{
-        struct idmap_defer_req *mdr =
-                container_of(dreq, struct idmap_defer_req, deferred_req);
-        wake_up(&mdr->waitq);
-        put_mdr(mdr);
-}
-static struct cache_deferred_req *
-idmap_defer(struct cache_req *req)
-{
-        struct idmap_defer_req *mdr =
-                container_of(req, struct idmap_defer_req, req);
-        mdr->deferred_req.revisit = idmap_revisit;
-        get_mdr(mdr);
-        return (&mdr->deferred_req);
-}
-static inline int
-do_idmap_lookup(struct ent *(*lookup_fn)(struct ent *), struct ent *key,
-                struct cache_detail *detail, struct ent **item,
-                struct idmap_defer_req *mdr)
-{
-        *item = lookup_fn(key);
-        if (!*item)
-                return -ENOMEM;
-        return cache_check(detail, &(*item)->h, &mdr->req);
-}
-static inline int
-do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
-                        struct ent *key, struct cache_detail *detail,
-                        struct ent **item)
-{
-        int ret = -ENOMEM;
-        *item = lookup_fn(key);
-        if (!*item)
-                goto out_err;
-        ret = -ETIMEDOUT;
-        if (!test_bit(CACHE_VALID, &(*item)->h.flags)
-                        || (*item)->h.expiry_time < get_seconds()
-                        || detail->flush_time > (*item)->h.last_refresh)
-                goto out_put;
-        ret = -ENOENT;
-        if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
-                goto out_put;
-        return 0;
-out_put:
-        cache_put(&(*item)->h, detail);
-out_err:
-        *item = NULL;
-        return ret;
-}
 static int
 idmap_lookup(struct svc_rqst *rqstp,
                struct ent *(*lookup_fn)(struct ent *), struct ent *key,
                struct cache_detail *detail, struct ent **item)
 {
-        struct idmap_defer_req *mdr;
        int ret;
-        mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
+        *item = lookup_fn(key);
-        if (!mdr)
+        if (!*item)
                return -ENOMEM;
-        atomic_set(&mdr->count, 1);
+ retry:
-        init_waitqueue_head(&mdr->waitq);
+        ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
-        mdr->req.defer = idmap_defer;
-        ret = do_idmap_lookup(lookup_fn, key, detail, item, mdr);
+        if (ret == -ETIMEDOUT) {
-        if (ret == -EAGAIN) {
+                struct ent *prev_item = *item;
-                wait_event_interruptible_timeout(mdr->waitq,
+                *item = lookup_fn(key);
-                        test_bit(CACHE_VALID, &(*item)->h.flags), 1 * HZ);
+                if (*item != prev_item)
-                ret = do_idmap_lookup_nowait(lookup_fn, key, detail, item);
+                        goto retry;
+                cache_put(&(*item)->h, detail);
        }
-        put_mdr(mdr);
        return ret;
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 59ec449b0c7f..0cdfd022bb7b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,8 +1031,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
        resp->cstate.session = NULL;
        fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
        fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
-        /* Use the deferral mechanism only for NFSv4.0 compounds */
+        /*
-        rqstp->rq_usedeferral = (args->minorversion == 0);
+         * Don't use the deferral mechanism for NFSv4; compounds make it
+         * too hard to avoid non-idempotency problems.
+         */
+        rqstp->rq_usedeferral = 0;
        /*
         * According to RFC3010, this takes precedence over all other errors.
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cf0d2ffb3c84..ad2bfa68d534 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -33,7 +33,7 @@
 */
 #include <linux/file.h>
-#include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
@@ -207,7 +207,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 {
        struct nfs4_delegation *dp;
        struct nfs4_file *fp = stp->st_file;
-        struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
        dprintk("NFSD alloc_init_deleg\n");
        /*
@@ -234,7 +233,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        nfs4_file_get_access(fp, O_RDONLY);
        dp->dl_flock = NULL;
        dp->dl_type = type;
-        dp->dl_ident = cb->cb_ident;
        dp->dl_stateid.si_boot = boot_time;
        dp->dl_stateid.si_stateownerid = current_delegid++;
        dp->dl_stateid.si_fileid = 0;
@@ -535,171 +533,262 @@ gen_sessionid(struct nfsd4_session *ses)
 */
 #define NFSD_MIN_HDR_SEQ_SZ  (24 + 12 + 44)
+/*
+ * Free every slot buffer referenced from ses->se_slots[], one per
+ * fore-channel request slot (se_fchannel.maxreqs of them).  The session
+ * structure itself is not freed here.
+ */
+static void free_session_slots(struct nfsd4_session *ses)
+{
+        int slot = 0;
+        while (slot < ses->se_fchannel.maxreqs)
+                kfree(ses->se_slots[slot++]);
+}
 /*
- * Give the client the number of ca_maxresponsesize_cached slots it
+ * We don't actually need to cache the rpc and session headers, so we
- * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
+ * can allocate a little less for each slot:
- * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
+ */
- * than NFSD_MAX_SLOTS_PER_SESSION.
+static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
- *
+{
- * If we run out of reserved DRC memory we should (up to a point)
+        return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+}
+/*
+ * Turn a requested maxresp_cached value into the number of bytes cached
+ * per slot: drop NFSD_MIN_HDR_SEQ_SZ (we don't cache the rpc header) and
+ * cap the remainder at NFSD_SLOT_CACHE_SIZE.
+ *
+ * NOTE(review): the subtraction is done in u32, so a request smaller than
+ * NFSD_MIN_HDR_SEQ_SZ wraps around and ends up capped to
+ * NFSD_SLOT_CACHE_SIZE instead of becoming 0 -- confirm this is intended.
+ */
+static int nfsd4_sanitize_slot_size(u32 size)
+{
+        u32 cached = size - NFSD_MIN_HDR_SEQ_SZ;
+        return min_t(u32, cached, NFSD_SLOT_CACHE_SIZE);
+}
+/*
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
 * re-negotiate active sessions and reduce their slot usage to make
 * rooom for new connections. For now we just fail the create session.
 */
-static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
+static int nfsd4_get_drc_mem(int slotsize, u32 num)
 {
-        int mem, size = fchan->maxresp_cached;
+        int avail;
+        /*
+         * Reserve DRC memory for up to @num slots of @slotsize bytes each
+         * and return how many slots were actually granted; the caller gives
+         * the reservation back with nfsd4_put_drc_mem(slotsize, <result>).
+         */
-        if (fchan->maxreqs < 1)
+        num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
-                return nfserr_inval;
-        if (size < NFSD_MIN_HDR_SEQ_SZ)
+        /*
+         * Slots that cache nothing use no DRC memory, so there is nothing
+         * to reserve -- and dividing by slotsize below would fault.
+         */
+        if (slotsize == 0)
+                return num;
+        spin_lock(&nfsd_drc_lock);
-                size = NFSD_MIN_HDR_SEQ_SZ;
+        avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
-        size -= NFSD_MIN_HDR_SEQ_SZ;
+                        nfsd_drc_max_mem - nfsd_drc_mem_used);
-        if (size > NFSD_SLOT_CACHE_SIZE)
+        num = min_t(int, num, avail / slotsize);
-                size = NFSD_SLOT_CACHE_SIZE;
+        nfsd_drc_mem_used += num * slotsize;
+        spin_unlock(&nfsd_drc_lock);
-        /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
-        mem = fchan->maxreqs * size;
+        return num;
-        if (mem > NFSD_MAX_MEM_PER_SESSION) {
+}
-                fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
-                if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
-                        fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-                mem = fchan->maxreqs * size;
-        }
+static void nfsd4_put_drc_mem(int slotsize, int num)
+{
        spin_lock(&nfsd_drc_lock);
-        /* bound the total session drc memory ussage */
+        nfsd_drc_mem_used -= slotsize * num;
-        if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
-                fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
-                mem = fchan->maxreqs * size;
-        }
-        nfsd_drc_mem_used += mem;
        spin_unlock(&nfsd_drc_lock);
+}
-        if (fchan->maxreqs == 0)
+static struct nfsd4_session *alloc_session(int slotsize, int numslots)
-                return nfserr_jukebox;
+{
+        struct nfsd4_session *new;
+        int mem, i;
-        fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
+        BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
-        return 0;
+                        + sizeof(struct nfsd4_session) > PAGE_SIZE);
+        mem = numslots * sizeof(struct nfsd4_slot *);
+        new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+        if (!new)
+                return NULL;
+        /* allocate each struct nfsd4_slot and data cache in one piece */
+        for (i = 0; i < numslots; i++) {
+                mem = sizeof(struct nfsd4_slot) + slotsize;
+                new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+                if (!new->se_slots[i])
+                        goto out_free;
+        }
+        return new;
+out_free:
+        while (i--)
+                kfree(new->se_slots[i]);
+        kfree(new);
+        return NULL;
 }
-/*
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
- * fchan holds the client values on input, and the server values on output
- * sv_max_mesg is the maximum payload plus one page for overhead.
- */
-static int init_forechannel_attrs(struct svc_rqst *rqstp,
-                                  struct nfsd4_channel_attrs *session_fchan,
-                                  struct nfsd4_channel_attrs *fchan)
 {
-        int status = 0;
+        u32 maxrpc = nfsd_serv->sv_max_mesg;
-        __u32   maxcount = nfsd_serv->sv_max_mesg;
-        /* headerpadsz set to zero in encode routine */
+        new->maxreqs = numslots;
+        new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+        new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
+        new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
+        new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+}
-        /* Use the client's max request and max response size if possible */
+static void free_conn(struct nfsd4_conn *c)
-        if (fchan->maxreq_sz > maxcount)
+{
-                fchan->maxreq_sz = maxcount;
+        svc_xprt_put(c->cn_xprt);
-        session_fchan->maxreq_sz = fchan->maxreq_sz;
+        kfree(c);
+}
-        if (fchan->maxresp_sz > maxcount)
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
-                fchan->maxresp_sz = maxcount;
+{
-        session_fchan->maxresp_sz = fchan->maxresp_sz;
+        struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+        struct nfs4_client *clp = c->cn_session->se_client;
-        /* Use the client's maxops if possible */
+        spin_lock(&clp->cl_lock);
-        if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+        if (!list_empty(&c->cn_persession)) {
-                fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+                list_del(&c->cn_persession);
-        session_fchan->maxops = fchan->maxops;
+                free_conn(c);
+        }
+        spin_unlock(&clp->cl_lock);
+}
-        /* FIXME: Error means no more DRC pages so the server should
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
-         * recover pages from existing sessions. For now fail session
+{
-         * creation.
+        struct nfsd4_conn *conn;
-         */
-        status = set_forechannel_drc_size(fchan);
-        session_fchan->maxresp_cached = fchan->maxresp_cached;
+        conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL);
-        session_fchan->maxreqs = fchan->maxreqs;
+        if (!conn)
+                return NULL;
+        svc_xprt_get(rqstp->rq_xprt);
+        conn->cn_xprt = rqstp->rq_xprt;
+        conn->cn_flags = flags;
+        INIT_LIST_HEAD(&conn->cn_xpt_user.list);
+        return conn;
+}
-        dprintk("%s status %d\n", __func__, status);
+static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
-        return status;
+{
+        conn->cn_session = ses;
+        list_add(&conn->cn_persession, &ses->se_conns);
 }
-static void
+static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
-free_session_slots(struct nfsd4_session *ses)
 {
-        int i;
+        struct nfs4_client *clp = ses->se_client;
-        for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+        spin_lock(&clp->cl_lock);
-                kfree(ses->se_slots[i]);
+        __nfsd4_hash_conn(conn, ses);
+        spin_unlock(&clp->cl_lock);
 }
-/*
+static int nfsd4_register_conn(struct nfsd4_conn *conn)
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
 {
-        return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+        conn->cn_xpt_user.callback = nfsd4_conn_lost;
+        return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
 }
-static int
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
-alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
-                   struct nfsd4_create_session *cses)
 {
-        struct nfsd4_session *new, tmp;
+        struct nfsd4_conn *conn;
-        struct nfsd4_slot *sp;
+        u32 flags = NFS4_CDFC4_FORE;
-        int idx, slotsize, cachesize, i;
+        int ret;
-        int status;
-        memset(&tmp, 0, sizeof(tmp));
+        if (ses->se_flags & SESSION4_BACK_CHAN)
+                flags |= NFS4_CDFC4_BACK;
+        conn = alloc_conn(rqstp, flags);
+        if (!conn)
+                return nfserr_jukebox;
+        nfsd4_hash_conn(conn, ses);
+        ret = nfsd4_register_conn(conn);
+        if (ret)
+                /* oops; xprt is already down: */
+                nfsd4_conn_lost(&conn->cn_xpt_user);
+        return nfs_ok;
+}
-        /* FIXME: For now, we just accept the client back channel attributes. */
+static void nfsd4_del_conns(struct nfsd4_session *s)
-        tmp.se_bchannel = cses->back_channel;
+{
-        status = init_forechannel_attrs(rqstp, &tmp.se_fchannel,
+        struct nfs4_client *clp = s->se_client;
-                                        &cses->fore_channel);
+        struct nfsd4_conn *c;
-        if (status)
-                goto out;
-        BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+        spin_lock(&clp->cl_lock);
-                     + sizeof(struct nfsd4_session) > PAGE_SIZE);
+        while (!list_empty(&s->se_conns)) {
+                c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession);
+                list_del_init(&c->cn_persession);
+                spin_unlock(&clp->cl_lock);
-        status = nfserr_jukebox;
+                unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user);
-        /* allocate struct nfsd4_session and slot table pointers in one piece */
+                free_conn(c);
-        slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
-        new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
-        if (!new)
-                goto out;
-        memcpy(new, &tmp, sizeof(*new));
+                spin_lock(&clp->cl_lock);
+        }
+        spin_unlock(&clp->cl_lock);
+}
-        /* allocate each struct nfsd4_slot and data cache in one piece */
+void free_session(struct kref *kref)
-        cachesize = slot_bytes(&new->se_fchannel);
+{
-        for (i = 0; i < new->se_fchannel.maxreqs; i++) {
+        struct nfsd4_session *ses;
-                sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
+        int mem;
-                if (!sp)
-                        goto out_free;
+        ses = container_of(kref, struct nfsd4_session, se_ref);
-                new->se_slots[i] = sp;
+        nfsd4_del_conns(ses);
+        spin_lock(&nfsd_drc_lock);
+        mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+        nfsd_drc_mem_used -= mem;
+        spin_unlock(&nfsd_drc_lock);
+        free_session_slots(ses);
+        kfree(ses);
+}
+/*
+ * Allocate a session for @clp from the CREATE_SESSION arguments in @cses,
+ * hash it, and attach the connection the request arrived on.
+ *
+ * The fore channel gets a slot size derived from the client's
+ * maxresp_cached and as many slots as nfsd4_get_drc_mem() grants.  If the
+ * client has no callback session yet and SESSION4_BACK_CHAN is set, the new
+ * session also becomes the client's callback session and a callback probe
+ * is started.
+ *
+ * Returns the new session, or NULL if the session or its connection could
+ * not be allocated.
+ */
+static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+{
+        struct nfsd4_session *new;
+        struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
+        int numslots, slotsize;
+        int status;
+        int idx;
+        /*
+         * Note decreasing slot size below client's request may
+         * make it difficult for client to function correctly, whereas
+         * decreasing the number of slots will (just?) affect
+         * performance.  When short on memory we therefore prefer to
+         * decrease number of slots instead of their size.
+         */
+        slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
+        numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+        new = alloc_session(slotsize, numslots);
+        if (!new) {
+                /* give back exactly what was reserved: numslots, not maxreqs */
+                nfsd4_put_drc_mem(slotsize, numslots);
+                return NULL;
         }
+        init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
         new->se_client = clp;
         gen_sessionid(new);
-        idx = hash_sessionid(&new->se_sessionid);
-        memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
-               NFS4_MAX_SESSIONID_LEN);
+        INIT_LIST_HEAD(&new->se_conns);
+        new->se_cb_seq_nr = 1;
         new->se_flags = cses->flags;
+        new->se_cb_prog = cses->callback_prog;
         kref_init(&new->se_ref);
+        idx = hash_sessionid(&new->se_sessionid);
         spin_lock(&client_lock);
         list_add(&new->se_hash, &sessionid_hashtbl[idx]);
         list_add(&new->se_perclnt, &clp->cl_sessions);
         spin_unlock(&client_lock);
-        status = nfs_ok;
+        status = nfsd4_new_conn(rqstp, new);
-out:
+        /* whoops: benny points out, status is ignored! (err, or bogus) */
-        return status;
+        if (status) {
-out_free:
+                /*
+                 * The session is already on the sessionid hash and on
+                 * clp->cl_sessions; unlink it before freeing so neither
+                 * list is left pointing at freed memory.
+                 */
+                spin_lock(&client_lock);
+                list_del(&new->se_hash);
+                list_del(&new->se_perclnt);
+                spin_unlock(&client_lock);
+                free_session(&new->se_ref);
-        free_session_slots(new);
+                return NULL;
-        kfree(new);
+        }
-        goto out;
+        if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+                struct sockaddr *sa = svc_addr(rqstp);
+                clp->cl_cb_session = new;
+                clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+                svc_xprt_get(rqstp->rq_xprt);
+                rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
+                clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+                nfsd4_probe_callback(clp);
+        }
+        return new;
 }
 /* caller must hold client_lock */
@@ -731,21 +820,6 @@ unhash_session(struct nfsd4_session *ses)
        list_del(&ses->se_perclnt);
 }
-void
-free_session(struct kref *kref)
-{
-        struct nfsd4_session *ses;
-        int mem;
-        ses = container_of(kref, struct nfsd4_session, se_ref);
-        spin_lock(&nfsd_drc_lock);
-        mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
-        nfsd_drc_mem_used -= mem;
-        spin_unlock(&nfsd_drc_lock);
-        free_session_slots(ses);
-        kfree(ses);
-}
 /* must be called under the client_lock */
 static inline void
 renew_client_locked(struct nfs4_client *clp)
@@ -812,6 +886,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
+        while (!list_empty(&clp->cl_sessions)) {
+                struct nfsd4_session *ses;
+                ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+                                se_perclnt);
+                list_del(&ses->se_perclnt);
+                nfsd4_put_session(ses);
+        }
        if (clp->cl_cred.cr_group_info)
                put_group_info(clp->cl_cred.cr_group_info);
        kfree(clp->cl_principal);
@@ -838,15 +919,12 @@ release_session_client(struct nfsd4_session *session)
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
+        struct nfsd4_session *ses;
        mark_client_expired(clp);
        list_del(&clp->cl_lru);
-        while (!list_empty(&clp->cl_sessions)) {
+        list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
-                struct nfsd4_session  *ses;
+                list_del_init(&ses->se_hash);
-                ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-                                 se_perclnt);
-                unhash_session(ses);
-                nfsd4_put_session(ses);
-        }
 }
 static void
@@ -875,7 +953,7 @@ expire_client(struct nfs4_client *clp)
                sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
                release_openowner(sop);
        }
-        nfsd4_set_callback_client(clp, NULL);
+        nfsd4_shutdown_callback(clp);
        if (clp->cl_cb_conn.cb_xprt)
                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
        list_del(&clp->cl_idhash);
@@ -960,6 +1038,8 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        if (clp == NULL)
                return NULL;
+        INIT_LIST_HEAD(&clp->cl_sessions);
        princ = svc_gss_principal(rqstp);
        if (princ) {
                clp->cl_principal = kstrdup(princ, GFP_KERNEL);
@@ -976,8 +1056,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        INIT_LIST_HEAD(&clp->cl_strhash);
        INIT_LIST_HEAD(&clp->cl_openowners);
        INIT_LIST_HEAD(&clp->cl_delegations);
-        INIT_LIST_HEAD(&clp->cl_sessions);
        INIT_LIST_HEAD(&clp->cl_lru);
+        spin_lock_init(&clp->cl_lock);
+        INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
        clp->cl_time = get_seconds();
        clear_bit(0, &clp->cl_cb_slot_busy);
        rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -986,7 +1067,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        clp->cl_flavor = rqstp->rq_flavor;
        copy_cred(&clp->cl_cred, &rqstp->rq_cred);
        gen_confirm(clp);
+        clp->cl_cb_session = NULL;
        return clp;
 }
@@ -1098,7 +1179,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
 static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
 {
-        struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+        struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
        unsigned short expected_family;
        /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1111,24 +1192,23 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
        else
                goto out_err;
-        cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+        conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
                                            se->se_callback_addr_len,
-                                            (struct sockaddr *) &cb->cb_addr,
+                                            (struct sockaddr *)&conn->cb_addr,
-                                            sizeof(cb->cb_addr));
+                                            sizeof(conn->cb_addr));
-        if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
+        if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family)
                goto out_err;
-        if (cb->cb_addr.ss_family == AF_INET6)
+        if (conn->cb_addr.ss_family == AF_INET6)
-                ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+                ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid;
-        cb->cb_minorversion = 0;
+        conn->cb_prog = se->se_callback_prog;
-        cb->cb_prog = se->se_callback_prog;
+        conn->cb_ident = se->se_callback_ident;
-        cb->cb_ident = se->se_callback_ident;
        return;
 out_err:
-        cb->cb_addr.ss_family = AF_UNSPEC;
+        conn->cb_addr.ss_family = AF_UNSPEC;
-        cb->cb_addrlen = 0;
+        conn->cb_addrlen = 0;
        dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
                "will not receive delegations\n",
                clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -1415,7 +1495,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 {
        struct sockaddr *sa = svc_addr(rqstp);
        struct nfs4_client *conf, *unconf;
+        struct nfsd4_session *new;
        struct nfsd4_clid_slot *cs_slot = NULL;
+        bool confirm_me = false;
        int status = 0;
        nfs4_lock_state();
@@ -1438,7 +1520,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                                cs_slot->sl_seqid, cr_ses->seqid);
                        goto out;
                }
-                cs_slot->sl_seqid++;
        } else if (unconf) {
                if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
                    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1451,25 +1532,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                if (status) {
                        /* an unconfirmed replay returns misordered */
                        status = nfserr_seq_misordered;
-                        goto out_cache;
+                        goto out;
                }
-                cs_slot->sl_seqid++; /* from 0 to 1 */
+                confirm_me = true;
-                move_to_confirmed(unconf);
-                if (cr_ses->flags & SESSION4_BACK_CHAN) {
-                        unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-                        svc_xprt_get(rqstp->rq_xprt);
-                        rpc_copy_addr(
-                                (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-                                sa);
-                        unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-                        unconf->cl_cb_conn.cb_minorversion =
-                                cstate->minorversion;
-                        unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-                        unconf->cl_cb_seq_nr = 1;
-                        nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-                }
                conf = unconf;
        } else {
                status = nfserr_stale_clientid;
@@ -1477,22 +1543,30 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        }
        /*
+         * XXX: we should probably set this at creation time, and check
+         * for consistent minorversion use throughout:
+         */
+        conf->cl_minorversion = 1;
+        /*
         * We do not support RDMA or persistent sessions
         */
        cr_ses->flags &= ~SESSION4_PERSIST;
        cr_ses->flags &= ~SESSION4_RDMA;
-        status = alloc_init_session(rqstp, conf, cr_ses);
+        status = nfserr_jukebox;
-        if (status)
+        new = alloc_init_session(rqstp, conf, cr_ses);
+        if (!new)
                goto out;
+        status = nfs_ok;
-        memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+        memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
               NFS4_MAX_SESSIONID_LEN);
+        cs_slot->sl_seqid++;
        cr_ses->seqid = cs_slot->sl_seqid;
-out_cache:
        /* cache solo and embedded create sessions under the state lock */
        nfsd4_cache_create_session(cr_ses, cs_slot, status);
+        if (confirm_me)
+                move_to_confirmed(conf);
 out:
        nfs4_unlock_state();
        dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1546,8 +1620,11 @@ nfsd4_destroy_session(struct svc_rqst *r,
        nfs4_lock_state();
        /* wait for callbacks */
-        nfsd4_set_callback_client(ses->se_client, NULL);
+        nfsd4_shutdown_callback(ses->se_client);
        nfs4_unlock_state();
+        nfsd4_del_conns(ses);
        nfsd4_put_session(ses);
        status = nfs_ok;
 out:
@@ -1555,6 +1632,40 @@ out:
        return status;
 }
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+        struct nfsd4_conn *c;
+        list_for_each_entry(c, &s->se_conns, cn_persession) {
+                if (c->cn_xprt == xpt) {
+                        return c;
+                }
+        }
+        return NULL;
+}
+static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+        struct nfs4_client *clp = ses->se_client;
+        struct nfsd4_conn *c;
+        int ret;
+        spin_lock(&clp->cl_lock);
+        c = __nfsd4_find_conn(new->cn_xprt, ses);
+        if (c) {
+                spin_unlock(&clp->cl_lock);
+                free_conn(new);
+                return;
+        }
+        __nfsd4_hash_conn(new, ses);
+        spin_unlock(&clp->cl_lock);
+        ret = nfsd4_register_conn(new);
+        if (ret)
+                /* oops; xprt is already down: */
+                nfsd4_conn_lost(&new->cn_xpt_user);
+        return;
+}
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
               struct nfsd4_compound_state *cstate,
@@ -1563,11 +1674,20 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        struct nfsd4_compoundres *resp = rqstp->rq_resp;
        struct nfsd4_session *session;
        struct nfsd4_slot *slot;
+        struct nfsd4_conn *conn;
        int status;
        if (resp->opcnt != 1)
                return nfserr_sequence_pos;
+        /*
+         * Will be either used or freed by nfsd4_sequence_check_conn
+         * below.
+         */
+        conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+        if (!conn)
+                return nfserr_jukebox;
        spin_lock(&client_lock);
        status = nfserr_badsession;
        session = find_in_sessionid_hashtbl(&seq->sessionid);
@@ -1599,6 +1719,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        if (status)
                goto out;
+        nfsd4_sequence_check_conn(conn, session);
+        conn = NULL;
        /* Success! bump slot seqid */
        slot->sl_inuse = true;
        slot->sl_seqid = seq->seqid;
@@ -1613,6 +1736,7 @@ out:
                nfsd4_get_session(cstate->session);
                atomic_inc(&session->se_client->cl_refcount);
        }
+        kfree(conn);
        spin_unlock(&client_lock);
        dprintk("%s: return %d\n", __func__, ntohl(status));
        return status;
@@ -1747,6 +1871,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                gen_clid(new);
        }
+        /*
+         * XXX: we should probably set this at creation time, and check
+         * for consistent minorversion use throughout:
+         */
+        new->cl_minorversion = 0;
        gen_callback(new, setclid, rpc_get_scope_id(sa));
        add_to_unconfirmed(new, strhashval);
        setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
@@ -1807,7 +1936,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        status = nfserr_clid_inuse;
                else {
                        atomic_set(&conf->cl_cb_set, 0);
-                        nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
+                        nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+                        nfsd4_probe_callback(conf);
                        expire_client(unconf);
                        status = nfs_ok;
@@ -1841,7 +1971,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                        }
                        move_to_confirmed(unconf);
                        conf = unconf;
-                        nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+                        nfsd4_probe_callback(conf);
                        status = nfs_ok;
                }
        } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -2188,22 +2318,6 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 }
 /*
- * Set the delegation file_lock back pointer.
- *
- * Called from setlease() with lock_kernel() held.
- */
-static
-void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
-{
-        struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner;
-        dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp);
-        if (!dp)
-                return;
-        dp->dl_flock = new;
-}
-/*
 * Called from setlease() with lock_kernel() held
 */
 static
@@ -2233,7 +2347,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 static const struct lock_manager_operations nfsd_lease_mng_ops = {
        .fl_break = nfsd_break_deleg_cb,
        .fl_release_private = nfsd_release_deleg_cb,
-        .fl_copy_lock = nfsd_copy_lock_deleg_cb,
        .fl_mylease = nfsd_same_client_deleg_cb,
        .fl_change = nfsd_change_deleg_cb,
 };
@@ -2492,7 +2605,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        struct nfs4_delegation *dp;
        struct nfs4_stateowner *sop = stp->st_stateowner;
        int cb_up = atomic_read(&sop->so_client->cl_cb_set);
-        struct file_lock fl, *flp = &fl;
+        struct file_lock *fl;
        int status, flag = 0;
        flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2526,21 +2639,28 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
                flag = NFS4_OPEN_DELEGATE_NONE;
                goto out;
        }
-        locks_init_lock(&fl);
+        status = -ENOMEM;
-        fl.fl_lmops = &nfsd_lease_mng_ops;
+        fl = locks_alloc_lock();
-        fl.fl_flags = FL_LEASE;
+        if (!fl)
-        fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+                goto out;
-        fl.fl_end = OFFSET_MAX;
+        locks_init_lock(fl);
-        fl.fl_owner =  (fl_owner_t)dp;
+        fl->fl_lmops = &nfsd_lease_mng_ops;
-        fl.fl_file = find_readable_file(stp->st_file);
+        fl->fl_flags = FL_LEASE;
-        BUG_ON(!fl.fl_file);
+        fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-        fl.fl_pid = current->tgid;
+        fl->fl_end = OFFSET_MAX;
+        fl->fl_owner =  (fl_owner_t)dp;
+        fl->fl_file = find_readable_file(stp->st_file);
+        BUG_ON(!fl->fl_file);
+        fl->fl_pid = current->tgid;
+        dp->dl_flock = fl;
        /* vfs_setlease checks to see if delegation should be handed out.
         * the lock_manager callbacks fl_mylease and fl_change are used
         */
-        if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
+        if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
                dprintk("NFSD: setlease failed [%d], no delegation\n", status);
+                dp->dl_flock = NULL;
+                locks_free_lock(fl);
                unhash_delegation(dp);
                flag = NFS4_OPEN_DELEGATE_NONE;
                goto out;
@@ -2944,7 +3064,11 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
        if (STALE_STATEID(stateid)) 
                goto out;
-        status = nfserr_bad_stateid;
+        /*
+         * We assume that any stateid that has the current boot time,
+         * but that we can't find, is expired:
+         */
+        status = nfserr_expired;
        if (is_delegation_stateid(stateid)) {
                dp = find_delegation_stateid(ino, stateid);
                if (!dp)
@@ -2964,6 +3088,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                stp = find_stateid(stateid, flags);
                if (!stp)
                        goto out;
+                status = nfserr_bad_stateid;
                if (nfs4_check_fh(current_fh, stp))
                        goto out;
                if (!stp->st_stateowner->so_confirmed)
@@ -3038,8 +3163,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
                 * a replayed close:
                 */
                sop = search_close_lru(stateid->si_stateownerid, flags);
+                /* It's not stale; let's assume it's expired: */
                if (sop == NULL)
-                        return nfserr_bad_stateid;
+                        return nfserr_expired;
                *sopp = sop;
                goto check_replay;
        }
@@ -3304,6 +3430,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        status = nfserr_bad_stateid;
        if (!is_delegation_stateid(stateid))
                goto out;
+        status = nfserr_expired;
        dp = find_delegation_stateid(inode, stateid);
        if (!dp)
                goto out;
@@ -3895,7 +4022,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
        struct inode *inode = filp->fi_inode;
        int status = 0;
-        lock_kernel();
+        lock_flocks();
        for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
                if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
                        status = 1;
@@ -3903,7 +4030,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
                }
        }
 out:
-        unlock_kernel();
+        unlock_flocks();
        return status;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1a468bbd330f..f35a94a04026 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1805,19 +1805,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                                goto out_nfserr;
                }
        }
-        if ((buflen -= 16) < 0)
-                goto out_resource;
-        if (unlikely(bmval2)) {
+        if (bmval2) {
+                if ((buflen -= 16) < 0)
+                        goto out_resource;
                WRITE32(3);
                WRITE32(bmval0);
                WRITE32(bmval1);
                WRITE32(bmval2);
-        } else if (likely(bmval1)) {
+        } else if (bmval1) {
+                if ((buflen -= 12) < 0)
+                        goto out_resource;
                WRITE32(2);
                WRITE32(bmval0);
                WRITE32(bmval1);
        } else {
+                if ((buflen -= 8) < 0)
+                        goto out_resource;
                WRITE32(1);
                WRITE32(bmval0);
        }
@@ -1828,15 +1832,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
                u32 word1 = nfsd_suppattrs1(minorversion);
                u32 word2 = nfsd_suppattrs2(minorversion);
-                if ((buflen -= 12) < 0)
-                        goto out_resource;
                if (!aclsupport)
                        word0 &= ~FATTR4_WORD0_ACL;
                if (!word2) {
+                        if ((buflen -= 12) < 0)
+                                goto out_resource;
                        WRITE32(2);
                        WRITE32(word0);
                        WRITE32(word1);
                } else {
+                        if ((buflen -= 16) < 0)
+                                goto out_resource;
                        WRITE32(3);
                        WRITE32(word0);
                        WRITE32(word1);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b53b1d042f1f..4514ebbee4d6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -22,6 +22,7 @@
 */
 enum {
        NFSD_Root = 1,
+#ifdef CONFIG_NFSD_DEPRECATED
        NFSD_Svc,
        NFSD_Add,
        NFSD_Del,
@@ -29,6 +30,7 @@ enum {
        NFSD_Unexport,
        NFSD_Getfd,
        NFSD_Getfs,
+#endif
        NFSD_List,
        NFSD_Export_features,
        NFSD_Fh,
@@ -54,6 +56,7 @@ enum {
 /*
 * write() for these nodes.
 */
+#ifdef CONFIG_NFSD_DEPRECATED
 static ssize_t write_svc(struct file *file, char *buf, size_t size);
 static ssize_t write_add(struct file *file, char *buf, size_t size);
 static ssize_t write_del(struct file *file, char *buf, size_t size);
@@ -61,6 +64,7 @@ static ssize_t write_export(struct file *file, char *buf, size_t size);
 static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+#endif
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
@@ -76,6 +80,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+#ifdef CONFIG_NFSD_DEPRECATED
        [NFSD_Svc] = write_svc,
        [NFSD_Add] = write_add,
        [NFSD_Del] = write_del,
@@ -83,6 +88,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
        [NFSD_Unexport] = write_unexport,
        [NFSD_Getfd] = write_getfd,
        [NFSD_Getfs] = write_getfs,
+#endif
        [NFSD_Fh] = write_filehandle,
        [NFSD_FO_UnlockIP] = write_unlock_ip,
        [NFSD_FO_UnlockFS] = write_unlock_fs,
@@ -121,6 +127,14 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
 static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
 {
+        static int warned;
+        if (file->f_dentry->d_name.name[0] == '.' && !warned) {
+                printk(KERN_INFO
+                       "Warning: \"%s\" uses deprecated NFSD interface: %s."
+                       "  This will be removed in 2.6.40\n",
+                       current->comm, file->f_dentry->d_name.name);
+                warned = 1;
+        }
        if (! file->private_data) {
                /* An attempt to read a transaction file without writing
                 * causes a 0-byte write so that the file can return
@@ -137,6 +151,7 @@ static const struct file_operations transaction_ops = {
        .write          = nfsctl_transaction_write,
        .read           = nfsctl_transaction_read,
        .release        = simple_transaction_release,
+        .llseek         = default_llseek,
 };
 static int exports_open(struct inode *inode, struct file *file)
@@ -186,6 +201,7 @@ static const struct file_operations pool_stats_operations = {
 * payload - write methods
 */
+#ifdef CONFIG_NFSD_DEPRECATED
 /**
 * write_svc - Start kernel's NFSD server
 *
@@ -401,7 +417,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
        ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-        clp = auth_unix_lookup(&in6);
+        clp = auth_unix_lookup(&init_net, &in6);
        if (!clp)
                err = -EPERM;
        else {
@@ -464,7 +480,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
        ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6);
-        clp = auth_unix_lookup(&in6);
+        clp = auth_unix_lookup(&init_net, &in6);
        if (!clp)
                err = -EPERM;
        else {
@@ -481,6 +497,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 out:
        return err;
 }
+#endif /* CONFIG_NFSD_DEPRECATED */
 /**
 * write_unlock_ip - Release all locks used by a client
@@ -999,12 +1016,12 @@ static ssize_t __write_ports_addxprt(char *buf)
        if (err != 0)
                return err;
-        err = svc_create_xprt(nfsd_serv, transport,
+        err = svc_create_xprt(nfsd_serv, transport, &init_net,
                                PF_INET, port, SVC_SOCK_ANONYMOUS);
        if (err < 0)
                goto out_err;
-        err = svc_create_xprt(nfsd_serv, transport,
+        err = svc_create_xprt(nfsd_serv, transport, &init_net,
                                PF_INET6, port, SVC_SOCK_ANONYMOUS);
        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_close;
@@ -1355,6 +1372,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
        static struct tree_descr nfsd_files[] = {
+#ifdef CONFIG_NFSD_DEPRECATED
                [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR},
                [NFSD_Add] = {".add", &transaction_ops, S_IWUSR},
                [NFSD_Del] = {".del", &transaction_ops, S_IWUSR},
@@ -1362,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
                [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR},
                [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
+#endif
                [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
                [NFSD_Export_features] = {"export_features",
                                        &export_features_operations, S_IRUGO},
@@ -1386,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
        return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
-static int nfsd_get_sb(struct file_system_type *fs_type,
+static struct dentry *nfsd_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
+        return mount_single(fs_type, flags, data, nfsd_fill_super);
 }
 static struct file_system_type nfsd_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfsd",
-        .get_sb         = nfsd_get_sb,
+        .mount          = nfsd_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index b76ac3a82e39..6b641cf2c19a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -249,7 +249,7 @@ extern time_t nfsd4_grace;
 #define COMPOUND_SLACK_SPACE            140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE        12     /* OP_SETATTR */
-#define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+#define NFSD_LAUNDROMAT_MINTIMEOUT      1   /* seconds */
 /*
 * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e2c43464f237..2bae1d86f5f2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -16,6 +16,7 @@
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
 #include "vfs.h"
@@ -186,12 +187,12 @@ static int nfsd_init_socks(int port)
        if (!list_empty(&nfsd_serv->sv_permsocks))
                return 0;
-        error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+        error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
-        error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+        error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 322518c88e4b..39adc27b0685 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@
 #ifndef _NFSD4_STATE_H
 #define _NFSD4_STATE_H
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsfh.h>
 #include "nfsfh.h"
@@ -64,19 +65,12 @@ typedef struct {
        (s)->si_fileid, \
        (s)->si_generation
-struct nfsd4_cb_sequence {
-        /* args/res */
-        u32                     cbs_minorversion;
-        struct nfs4_client      *cbs_clp;
-};
-struct nfs4_rpc_args {
-        void                            *args_op;
-        struct nfsd4_cb_sequence        args_seq;
-};
 struct nfsd4_callback {
-        struct nfs4_rpc_args cb_args;
+        void *cb_op;
+        struct nfs4_client *cb_clp;
+        u32 cb_minorversion;
+        struct rpc_message cb_msg;
+        const struct rpc_call_ops *cb_ops;
        struct work_struct cb_work;
 };
@@ -91,7 +85,6 @@ struct nfs4_delegation {
        u32                     dl_type;
        time_t                  dl_time;
 /* For recall: */
-        u32                     dl_ident;
        stateid_t               dl_stateid;
        struct knfsd_fh         dl_fh;
        int                     dl_retries;
@@ -103,8 +96,8 @@ struct nfs4_cb_conn {
        /* SETCLIENTID info */
        struct sockaddr_storage cb_addr;
        size_t                  cb_addrlen;
-        u32                     cb_prog;
+        u32                     cb_prog; /* used only in 4.0 case;
-        u32                     cb_minorversion;
+                                            per-session otherwise */
        u32                     cb_ident;       /* minorversion 0 only */
        struct svc_xprt         *cb_xprt;       /* minorversion 1 only */
 };
@@ -160,6 +153,15 @@ struct nfsd4_clid_slot {
        struct nfsd4_create_session     sl_cr_ses;
 };
+struct nfsd4_conn {
+        struct list_head cn_persession;
+        struct svc_xprt *cn_xprt;
+        struct svc_xpt_user cn_xpt_user;
+        struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+        unsigned char cn_flags;
+};
 struct nfsd4_session {
        struct kref             se_ref;
        struct list_head        se_hash;        /* hash by sessionid */
@@ -169,6 +171,9 @@ struct nfsd4_session {
        struct nfs4_sessionid   se_sessionid;
        struct nfsd4_channel_attrs se_fchannel;
        struct nfsd4_channel_attrs se_bchannel;
+        struct list_head        se_conns;
+        u32                     se_cb_prog;
+        u32                     se_cb_seq_nr;
        struct nfsd4_slot       *se_slots[];    /* forward channel slots */
 };
@@ -221,24 +226,32 @@ struct nfs4_client {
        clientid_t              cl_clientid;    /* generated by server */
        nfs4_verifier           cl_confirm;     /* generated by server */
        u32                     cl_firststate;  /* recovery dir creation */
+        u32                     cl_minorversion;
        /* for v4.0 and v4.1 callbacks: */
        struct nfs4_cb_conn     cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE  1
+#define NFSD4_CLIENT_KILL       2
+        unsigned long           cl_cb_flags;
        struct rpc_clnt         *cl_cb_client;
+        u32                     cl_cb_ident;
        atomic_t                cl_cb_set;
+        struct nfsd4_callback   cl_cb_null;
+        struct nfsd4_session    *cl_cb_session;
+        /* for all client information that callback code might need: */
+        spinlock_t              cl_lock;
        /* for nfs41 */
        struct list_head        cl_sessions;
        struct nfsd4_clid_slot  cl_cs_slot;     /* create_session slot */
        u32                     cl_exchange_flags;
-        struct nfs4_sessionid   cl_sessionid;
        /* number of rpc's in progress over an associated session: */
        atomic_t                cl_refcount;
        /* for nfs41 callbacks */
        /* We currently support a single back channel with a single slot */
        unsigned long           cl_cb_slot_busy;
-        u32                     cl_cb_seq_nr;
        struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
                                                /* wait here for slots */
 };
@@ -440,12 +453,13 @@ extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
-extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf8e826..184938fcff04 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp)
 {
        struct inode *inode = fhp->fh_dentry->d_inode;
        const struct export_operations *export_ops = inode->i_sb->s_export_op;
-        int error = 0;
        if (!EX_ISSYNC(fhp->fh_export))
                return 0;
-        if (export_ops->commit_metadata) {
+        if (export_ops->commit_metadata)
-                error = export_ops->commit_metadata(inode);
+                return export_ops->commit_metadata(inode);
-        } else {
+        return sync_inode_metadata(inode, 1);
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_ALL,
-                        .nr_to_write = 0, /* metadata only */
-                };
-                error = sync_inode(inode, &wbc);
-        }
-        return error;
 }
 /*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index df3e62c1ddc5..85c98737a146 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
 nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
        btnode.o bmap.o btree.o direct.o dat.o recovery.o \
        the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
-        ifile.o alloc.o gcinode.o ioctl.o gcdat.o
+        ifile.o alloc.o gcinode.o ioctl.o
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3dbdc1d356bf..8b782b062baa 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,18 +533,20 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
        nilfs_btree_init_gc(bmap);
 }
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_save(const struct nilfs_bmap *bmap,
+                     struct nilfs_bmap_store *store)
 {
-        memcpy(gcbmap, bmap, sizeof(*bmap));
+        memcpy(store->data, bmap->b_u.u_data, sizeof(store->data));
-        init_rwsem(&gcbmap->b_sem);
+        store->last_allocated_key = bmap->b_last_allocated_key;
-        lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+        store->last_allocated_ptr = bmap->b_last_allocated_ptr;
-        gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
+        store->state = bmap->b_state;
 }
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+void nilfs_bmap_restore(struct nilfs_bmap *bmap,
+                        const struct nilfs_bmap_store *store)
 {
-        memcpy(bmap, gcbmap, sizeof(*bmap));
+        memcpy(bmap->b_u.u_data, store->data, sizeof(store->data));
-        init_rwsem(&bmap->b_sem);
+        bmap->b_last_allocated_key = store->last_allocated_key;
-        lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+        bmap->b_last_allocated_ptr = store->last_allocated_ptr;
-        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+        bmap->b_state = store->state;
 }
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a20569b19929..bde1c0aa2e15 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -135,6 +135,12 @@ struct nilfs_bmap {
 /* state */
 #define NILFS_BMAP_DIRTY        0x00000001
+struct nilfs_bmap_store {  /* bmap snapshot written by nilfs_bmap_save() and read back by nilfs_bmap_restore() */
+        __le64 data[NILFS_BMAP_SIZE / sizeof(__le64)];  /* copy of bmap->b_u.u_data */
+        __u64 last_allocated_key;  /* copy of bmap->b_last_allocated_key */
+        __u64 last_allocated_ptr;  /* copy of bmap->b_last_allocated_ptr */
+        int state;  /* copy of bmap->b_state */
+};
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
@@ -153,9 +159,9 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
 int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
 void nilfs_bmap_init_gc(struct nilfs_bmap *);
-void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
-void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
+void nilfs_bmap_save(const struct nilfs_bmap *, struct nilfs_bmap_store *);
+void nilfs_bmap_restore(struct nilfs_bmap *, const struct nilfs_bmap_store *);
 static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
                                    __u64 *ptr)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index f78ab1044d1d..5115814cb745 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -37,15 +37,7 @@
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
-        memset(btnc, 0, sizeof(*btnc));
+        nilfs_mapping_init_once(btnc);
-        INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
-        spin_lock_init(&btnc->tree_lock);
-        INIT_LIST_HEAD(&btnc->private_list);
-        spin_lock_init(&btnc->private_lock);
-        spin_lock_init(&btnc->i_mmap_lock);
-        INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
-        INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
 static const struct address_space_operations def_btnode_aops = {
@@ -55,12 +47,7 @@ static const struct address_space_operations def_btnode_aops = {
 void nilfs_btnode_cache_init(struct address_space *btnc,
                             struct backing_dev_info *bdi)
 {
-        btnc->host = NULL;  /* can safely set to host inode ? */
+        nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
-        btnc->flags = 0;
-        mapping_set_gfp_mask(btnc, GFP_NOFS);
-        btnc->assoc_mapping = NULL;
-        btnc->backing_dev_info = bdi;
-        btnc->a_ops = &def_btnode_aops;
 }
 void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 18737818db63..5ff15a8a1024 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -863,26 +863,19 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 */
 int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 {
-        struct the_nilfs *nilfs;
        int ret;
-        nilfs = NILFS_MDT(cpfile)->mi_nilfs;
        switch (mode) {
        case NILFS_CHECKPOINT:
-                /*
+                if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno))
-                 * Check for protecting existing snapshot mounts:
+                        /*
-                 * ns_mount_mutex is used to make this operation atomic and
+                         * Current implementation does not have to protect
-                 * exclusive with a new mount job.  Though it doesn't cover
+                         * plain read-only mounts since they are exclusive
-                 * umount, it's enough for the purpose.
+                         * with a read/write mount and are protected from the
-                 */
+                         * cleaner.
-                if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
+                         */
-                        /* Current implementation does not have to protect
-                           plain read-only mounts since they are exclusive
-                           with a read/write mount and are protected from the
-                           cleaner. */
                        ret = -EBUSY;
-                } else
+                else
                        ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
                return ret;
        case NILFS_SNAPSHOT:
@@ -933,27 +926,40 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 }
 /**
- * nilfs_cpfile_read - read cpfile inode
+ * nilfs_cpfile_read - read or get cpfile inode
- * @cpfile: cpfile inode
+ * @sb: super block instance
- * @raw_inode: on-disk cpfile inode
- */
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
-{
-        return nilfs_read_inode_common(cpfile, raw_inode);
-}
-/**
- * nilfs_cpfile_new - create cpfile
- * @nilfs: nilfs object
 * @cpsize: size of a checkpoint entry
+ * @raw_inode: on-disk cpfile inode
+ * @inodep: buffer to store the inode
 */
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
+                      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
        struct inode *cpfile;
+        int err;
+        cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
+        if (unlikely(!cpfile))
+                return -ENOMEM;
+        if (!(cpfile->i_state & I_NEW))
+                goto out;
+        err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
+        if (err)
+                goto failed;
-        cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+        nilfs_mdt_set_entry_size(cpfile, cpsize,
-        if (cpfile)
+                                 sizeof(struct nilfs_cpfile_header));
-                nilfs_mdt_set_entry_size(cpfile, cpsize,
-                                         sizeof(struct nilfs_cpfile_header));
+        err = nilfs_read_inode_common(cpfile, raw_inode);
-        return cpfile;
+        if (err)
+                goto failed;
+        unlock_new_inode(cpfile);
+ out:
+        *inodep = cpfile;
+        return 0;
+ failed:
+        iget_failed(cpfile);
+        return err;
 }
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index bc0809e0ab43..a242b9a314f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,7 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
                                size_t);
-int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
-struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+                      struct nilfs_inode *raw_inode, struct inode **inodep);
 #endif  /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 013146755683..49c844dab33a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -36,6 +36,7 @@
 struct nilfs_dat_info {
        struct nilfs_mdt_info mi;
        struct nilfs_palloc_cache palloc_cache;
+        struct nilfs_shadow_map shadow;
 };
 static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
@@ -102,7 +103,8 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
        nilfs_palloc_abort_alloc_entry(dat, req);
 }
-void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
+static void nilfs_dat_commit_free(struct inode *dat,
+                                  struct nilfs_palloc_req *req)
 {
        struct nilfs_dat_entry *entry;
        void *kaddr;
@@ -327,6 +329,23 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
        ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
        if (ret < 0)
                return ret;
+        /*
+         * The given disk block number (blocknr) is not yet written to
+         * the device at this point.
+         *
+         * To prevent nilfs_dat_translate() from returning the
+         * uncommitted block number, this makes a copy of the entry
+         * buffer and redirects nilfs_dat_translate() to the copy.
+         */
+        if (!buffer_nilfs_redirected(entry_bh)) {
+                ret = nilfs_mdt_freeze_buffer(dat, entry_bh);
+                if (ret) {
+                        brelse(entry_bh);
+                        return ret;
+                }
+        }
        kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
        entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
        if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
@@ -371,7 +390,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 */
 int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 {
-        struct buffer_head *entry_bh;
+        struct buffer_head *entry_bh, *bh;
        struct nilfs_dat_entry *entry;
        sector_t blocknr;
        void *kaddr;
@@ -381,6 +400,15 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
        if (ret < 0)
                return ret;
+        if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
+                bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
+                if (bh) {
+                        WARN_ON(!buffer_uptodate(bh));
+                        brelse(entry_bh);
+                        entry_bh = bh;
+                }
+        }
        kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
        entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
        blocknr = le64_to_cpu(entry->de_blocknr);
@@ -436,38 +464,48 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 }
 /**
- * nilfs_dat_read - read dat inode
+ * nilfs_dat_read - read or get dat inode
- * @dat: dat inode
+ * @sb: super block instance
- * @raw_inode: on-disk dat inode
- */
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
-{
-        return nilfs_read_inode_common(dat, raw_inode);
-}
-/**
- * nilfs_dat_new - create dat file
- * @nilfs: nilfs object
 * @entry_size: size of a dat entry
+ * @raw_inode: on-disk dat inode
+ * @inodep: buffer to store the inode
 */
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
+                   struct nilfs_inode *raw_inode, struct inode **inodep)
 {
        static struct lock_class_key dat_lock_key;
        struct inode *dat;
        struct nilfs_dat_info *di;
        int err;
-        dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+        dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
-        if (dat) {
+        if (unlikely(!dat))
-                err = nilfs_palloc_init_blockgroup(dat, entry_size);
+                return -ENOMEM;
-                if (unlikely(err)) {
+        if (!(dat->i_state & I_NEW))
-                        nilfs_mdt_destroy(dat);
+                goto out;
-                        return NULL;
-                }
-                di = NILFS_DAT_I(dat);
+        err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
-                lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+        if (err)
-                nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+                goto failed;
-        }
-        return dat;
+        err = nilfs_palloc_init_blockgroup(dat, entry_size);
+        if (err)
+                goto failed;
+        di = NILFS_DAT_I(dat);
+        lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+        nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+        nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+        err = nilfs_read_inode_common(dat, raw_inode);
+        if (err)
+                goto failed;
+        unlock_new_inode(dat);
+ out:
+        *inodep = dat;
+        return 0;
+ failed:
+        iget_failed(dat);
+        return err;
 }
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d31c3aab0efe..cbd8e9732503 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,7 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
-int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+int nilfs_dat_read(struct super_block *sb, size_t entry_size,
-struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+                   struct nilfs_inode *raw_inode, struct inode **inodep);
 #endif  /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h
new file mode 100644
index 000000000000..a71cc412b651
--- /dev/null
+++ b/fs/nilfs2/export.h
@@ -0,0 +1,17 @@
+#ifndef NILFS_EXPORT_H
+#define NILFS_EXPORT_H
+#include <linux/exportfs.h>
+extern const struct export_operations nilfs_export_ops;
+struct nilfs_fid {  /* NOTE(review): presumably the file-handle payload used by nilfs_export_ops - confirm in export.c */
+        u64 cno;  /* presumably a checkpoint number - TODO confirm */
+        u64 ino;  /* presumably the inode number - TODO confirm */
+        u32 gen;  /* presumably the inode generation - TODO confirm */
+        u32 parent_gen;  /* presumably the parent's generation - TODO confirm */
+        u64 parent_ino;  /* presumably the parent's inode number - TODO confirm */
+} __attribute__ ((packed));  /* packed: members are laid out without padding */
+#endif
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
deleted file mode 100644
index 84a45d1d5464..000000000000
--- a/fs/nilfs2/gcdat.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * gcdat.c - NILFS shadow DAT inode for GC
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
- *            and Ryusuke Konishi <ryusuke@osrg.net>.
- *
- */
-#include <linux/buffer_head.h>
-#include "nilfs.h"
-#include "page.h"
-#include "mdt.h"
-int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
-{
-        struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-        struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-        int err;
-        gcdat->i_state = 0;
-        gcdat->i_blocks = dat->i_blocks;
-        gii->i_flags = dii->i_flags;
-        gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
-        gii->i_cno = 0;
-        nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
-        err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
-        if (unlikely(err))
-                return err;
-        return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
-                                      &dii->i_btnode_cache);
-}
-void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
-{
-        struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
-        struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
-        struct address_space *mapping = dat->i_mapping;
-        struct address_space *gmapping = gcdat->i_mapping;
-        down_write(&NILFS_MDT(dat)->mi_sem);
-        dat->i_blocks = gcdat->i_blocks;
-        dii->i_flags = gii->i_flags;
-        dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
-        nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
-        nilfs_palloc_clear_cache(dat);
-        nilfs_palloc_clear_cache(gcdat);
-        nilfs_clear_dirty_pages(mapping);
-        nilfs_copy_back_pages(mapping, gmapping);
-        /* note: mdt dirty flags should be cleared by segctor. */
-        nilfs_clear_dirty_pages(&dii->i_btnode_cache);
-        nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
-        up_write(&NILFS_MDT(dat)->mi_sem);
-}
-void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
-{
-        struct inode *gcdat = nilfs->ns_gc_dat;
-        struct nilfs_inode_info *gii = NILFS_I(gcdat);
-        gcdat->i_state = I_FREEING | I_CLEAR;
-        gii->i_flags = 0;
-        nilfs_palloc_clear_cache(gcdat);
-        truncate_inode_pages(gcdat->i_mapping, 0);
-        truncate_inode_pages(&gii->i_btnode_cache, 0);
-}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index bed3a783129b..33ad25ddd5c4 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,13 +28,6 @@
 * gcinodes), and this file provides lookup function of the dummy
 * inodes and their buffer read function.
 *
- * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
- * has to treat blocks that belong to a same file but have different
- * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separately from actual inodes, and their lookup
- * function (nilfs_gc_iget) is designed to be specified with a
- * checkpoint number argument as well as an inode number.
- *
 * Buffers and pages held by the dummy inodes will be released each
 * time after they are copied to a new log.  Dirty blocks made on the
 * current generation and the blocks to be moved by GC never overlap
@@ -175,125 +168,46 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
                }
                nilfs_btnode_mark_dirty(bh);
        } else {
-                nilfs_mdt_mark_buffer_dirty(bh);
+                nilfs_mark_buffer_dirty(bh);
        }
        return 0;
 }
-/*
+int nilfs_init_gcinode(struct inode *inode)
- * nilfs_init_gccache() - allocate and initialize gc_inode hash table
- * @nilfs - the_nilfs
- *
- * Return Value: On success, 0.
- * On error, a negative error code is returned.
- */
-int nilfs_init_gccache(struct the_nilfs *nilfs)
 {
-        int loop;
+        struct nilfs_inode_info *ii = NILFS_I(inode);
+        struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
-        BUG_ON(nilfs->ns_gc_inodes_h);
-        INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
-        nilfs->ns_gc_inodes_h =
-                kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
-                        GFP_NOFS);
-        if (nilfs->ns_gc_inodes_h == NULL)
-                return -ENOMEM;
-        for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
-                INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
-        return 0;
-}
-/*
- * nilfs_destroy_gccache() - free gc_inode hash table
- * @nilfs - the nilfs
- */
-void nilfs_destroy_gccache(struct the_nilfs *nilfs)
-{
-        if (nilfs->ns_gc_inodes_h) {
-                nilfs_remove_all_gcinode(nilfs);
-                kfree(nilfs->ns_gc_inodes_h);
-                nilfs->ns_gc_inodes_h = NULL;
-        }
-}
-static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
-                                   __u64 cno)
-{
-        struct inode *inode;
-        struct nilfs_inode_info *ii;
-        inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
-        if (!inode)
-                return NULL;
-        inode->i_op = NULL;
+        inode->i_mode = S_IFREG;
-        inode->i_fop = NULL;
+        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
        inode->i_mapping->a_ops = &def_gcinode_aops;
+        inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
-        ii = NILFS_I(inode);
-        ii->i_cno = cno;
        ii->i_flags = 0;
-        ii->i_state = 1 << NILFS_I_GCINODE;
-        ii->i_bh = NULL;
        nilfs_bmap_init_gc(ii->i_bmap);
-        return inode;
+        /*
-}
+         * Add the inode to GC inode list. Garbage Collection
+         * is serialized and no two processes manipulate the
-static unsigned long ihash(ino_t ino, __u64 cno)
+         * list simultaneously.
-{
+         */
-        return hash_long((unsigned long)((ino << 2) + cno),
+        igrab(inode);
-                         NILFS_GCINODE_HASH_BITS);
+        list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-}
-/*
- * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
- */
-struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
-{
-        struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
-        struct hlist_node *node;
-        struct inode *inode;
-        hlist_for_each_entry(inode, node, head, i_hash) {
-                if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
-                        return inode;
-        }
-        inode = alloc_gcinode(nilfs, ino, cno);
+        return 0;
-        if (likely(inode)) {
-                hlist_add_head(&inode->i_hash, head);
-                list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
-        }
-        return inode;
-}
-/*
- * nilfs_clear_gcinode() - clear and free a gc inode
- */
-void nilfs_clear_gcinode(struct inode *inode)
-{
-        nilfs_mdt_destroy(inode);
 }
-/*
+/**
- * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
+ * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes
 */
-void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
 {
-        struct hlist_head *head = nilfs->ns_gc_inodes_h;
+        struct list_head *head = &nilfs->ns_gc_inodes;
-        struct hlist_node *node, *n;
+        struct nilfs_inode_info *ii;
-        struct inode *inode;
-        int loop;
-        for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
+        while (!list_empty(head)) {
-                hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
+                ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
-                        hlist_del_init(&inode->i_hash);
+                list_del_init(&ii->i_dirty);
-                        list_del_init(&NILFS_I(inode)->i_dirty);
+                iput(&ii->vfs_inode);
-                        nilfs_clear_gcinode(inode); /* might sleep */
-                }
        }
 }
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 922d9dd42c8f..9f8a2da67f90 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -161,25 +161,46 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 /**
- * nilfs_ifile_new - create inode file
+ * nilfs_ifile_read - read or get ifile inode
- * @sbi: nilfs_sb_info struct
+ * @sb: super block instance
+ * @root: root object
 * @inode_size: size of an inode
+ * @raw_inode: on-disk ifile inode
+ * @inodep: buffer to store the inode
 */
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+                     size_t inode_size, struct nilfs_inode *raw_inode,
+                     struct inode **inodep)
 {
        struct inode *ifile;
        int err;
-        ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+        ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
-                              sizeof(struct nilfs_ifile_info));
+        if (unlikely(!ifile))
-        if (ifile) {
+                return -ENOMEM;
-                err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+        if (!(ifile->i_state & I_NEW))
-                if (unlikely(err)) {
+                goto out;
-                        nilfs_mdt_destroy(ifile);
-                        return NULL;
+        err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
-                }
+                             sizeof(struct nilfs_ifile_info));
-                nilfs_palloc_setup_cache(ifile,
+        if (err)
-                                         &NILFS_IFILE_I(ifile)->palloc_cache);
+                goto failed;
-        }
-        return ifile;
+        err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+        if (err)
+                goto failed;
+        nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
+        err = nilfs_read_inode_common(ifile, raw_inode);
+        if (err)
+                goto failed;
+        unlock_new_inode(ifile);
+ out:
+        *inodep = ifile;
+        return 0;
+ failed:
+        iget_failed(ifile);
+        return err;
 }
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index cbca32e498f2..59b6f2b51df6 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
-struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
+                     size_t inode_size, struct nilfs_inode *raw_inode,
+                     struct inode **inodep);
 #endif  /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index eccb2f2e2315..71d4bc8464e0 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -34,6 +34,12 @@
 #include "cpfile.h"
 #include "ifile.h"
+struct nilfs_iget_args {  /* lookup key passed as the opaque argument to nilfs_iget_test()/nilfs_iget_set() */
+        u64 ino;  /* inode number; compared with inode->i_ino */
+        __u64 cno;  /* compared with i_cno, but only when matching a GC inode */
+        struct nilfs_root *root;  /* compared with NILFS_I(inode)->i_root */
+        int for_gc;  /* nonzero when the lookup targets a GC inode */
+};
 /**
 * nilfs_get_block() - get a file block on the filesystem (callback function)
@@ -279,6 +285,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct inode *inode;
        struct nilfs_inode_info *ii;
+        struct nilfs_root *root;
        int err = -ENOMEM;
        ino_t ino;
@@ -289,15 +296,17 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        mapping_set_gfp_mask(inode->i_mapping,
                             mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+        root = NILFS_I(dir)->i_root;
        ii = NILFS_I(inode);
        ii->i_state = 1 << NILFS_I_NEW;
+        ii->i_root = root;
-        err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
+        err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
        if (unlikely(err))
                goto failed_ifile_create_inode;
        /* reference count of i_bh inherits from nilfs_mdt_read_block() */
-        atomic_inc(&sbi->s_inodes_count);
+        atomic_inc(&root->inodes_count);
        inode_init_owner(inode, dir, mode);
        inode->i_ino = ino;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -320,7 +329,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        /* ii->i_file_acl = 0; */
        /* ii->i_dir_acl = 0; */
        ii->i_dir_start_lookup = 0;
-        ii->i_cno = 0;
        nilfs_set_inode_flags(inode);
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
@@ -350,16 +358,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
        return ERR_PTR(err);
 }
-void nilfs_free_inode(struct inode *inode)
-{
-        struct super_block *sb = inode->i_sb;
-        struct nilfs_sb_info *sbi = NILFS_SB(sb);
-        /* XXX: check error code? Is there any thing I can do? */
-        (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
-        atomic_dec(&sbi->s_inodes_count);
-}
 void nilfs_set_inode_flags(struct inode *inode)
 {
        unsigned int flags = NILFS_I(inode)->i_flags;
@@ -410,7 +408,6 @@ int nilfs_read_inode_common(struct inode *inode,
                0 : le32_to_cpu(raw_inode->i_dir_acl);
 #endif
        ii->i_dir_start_lookup = 0;
-        ii->i_cno = 0;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -424,7 +421,8 @@ int nilfs_read_inode_common(struct inode *inode,
        return 0;
 }
-static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+static int __nilfs_read_inode(struct super_block *sb,
+                              struct nilfs_root *root, unsigned long ino,
                              struct inode *inode)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -434,11 +432,11 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
        int err;
        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
-        err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+        err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
        if (unlikely(err))
                goto bad_inode;
-        raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+        raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
        err = nilfs_read_inode_common(inode, raw_inode);
        if (err)
@@ -461,14 +459,14 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
                        inode, inode->i_mode,
                        huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
        }
-        nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+        nilfs_ifile_unmap_inode(root->ifile, ino, bh);
        brelse(bh);
        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
        nilfs_set_inode_flags(inode);
        return 0;
 failed_unmap:
-        nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+        nilfs_ifile_unmap_inode(root->ifile, ino, bh);
        brelse(bh);
 bad_inode:
@@ -476,18 +474,95 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
        return err;
 }
-struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
+static int nilfs_iget_test(struct inode *inode, void *opaque)  /* match callback for ilookup5()/iget5_locked(); nonzero means match */
+{
+        struct nilfs_iget_args *args = opaque;
+        struct nilfs_inode_info *ii;
+        if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)  /* inode number and root must both agree */
+                return 0;
+        ii = NILFS_I(inode);
+        if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
+                return !args->for_gc;  /* regular inode: matches only a non-GC lookup */
+        return args->for_gc && args->cno == ii->i_cno;  /* GC inode: needs a GC lookup with the same cno */
+}
+static int nilfs_iget_set(struct inode *inode, void *opaque)  /* init callback for iget5_locked(); always returns 0 */
+{
+        struct nilfs_iget_args *args = opaque;
+        inode->i_ino = args->ino;
+        if (args->for_gc) {
+                NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;  /* the bit nilfs_iget_test() checks to recognise GC inodes */
+                NILFS_I(inode)->i_cno = args->cno;
+                NILFS_I(inode)->i_root = NULL;  /* GC inodes carry no root */
+        } else {
+                if (args->root && args->ino == NILFS_ROOT_INO)
+                        nilfs_get_root(args->root);  /* nilfs_clear_inode() calls nilfs_put_root() under the same condition */
+                NILFS_I(inode)->i_root = args->root;
+        }
+        return 0;
+}
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+                            unsigned long ino)  /* look up a non-GC inode keyed by (root, ino); returns the ilookup5() result */
+{
+        struct nilfs_iget_args args = {
+                .ino = ino, .root = root, .cno = 0, .for_gc = 0  /* for_gc = 0, so cno is not compared by nilfs_iget_test() */
+        };
+        return ilookup5(sb, ino, nilfs_iget_test, &args);
+}
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+                                unsigned long ino)  /* get a non-GC inode keyed by (root, ino); returns the iget5_locked() result */
+{
+        struct nilfs_iget_args args = {
+                .ino = ino, .root = root, .cno = 0, .for_gc = 0  /* for_gc = 0, so cno is not compared by nilfs_iget_test() */
+        };
+        return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);  /* callers in this patch test I_NEW on the result */
+}
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+                         unsigned long ino)
 {
        struct inode *inode;
        int err;
-        inode = iget_locked(sb, ino);
+        inode = nilfs_iget_locked(sb, root, ino);
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        if (!(inode->i_state & I_NEW))
                return inode;
-        err = __nilfs_read_inode(sb, ino, inode);
+        err = __nilfs_read_inode(sb, root, ino, inode);
+        if (unlikely(err)) {
+                iget_failed(inode);
+                return ERR_PTR(err);
+        }
+        unlock_new_inode(inode);
+        return inode;
+}
+struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
+                                __u64 cno)
+{
+        struct nilfs_iget_args args = {
+                .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+        };
+        struct inode *inode;
+        int err;
+        inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
+        if (unlikely(!inode))
+                return ERR_PTR(-ENOMEM);
+        if (!(inode->i_state & I_NEW))
+                return inode;
+        err = nilfs_init_gcinode(inode);
        if (unlikely(err)) {
                iget_failed(inode);
                return ERR_PTR(err);
@@ -528,21 +603,20 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 {
        ino_t ino = inode->i_ino;
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        struct super_block *sb = inode->i_sb;
+        struct inode *ifile = ii->i_root->ifile;
-        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct nilfs_inode *raw_inode;
-        raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
+        raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
        if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
-                memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
+                memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
        set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
        nilfs_write_inode_common(inode, raw_inode, 0);
                /* XXX: call with has_bmap = 0 is a workaround to avoid
                   deadlock of bmap. This delays update of i_bmap to just
                   before writing */
-        nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
+        nilfs_ifile_unmap_inode(ifile, ino, ibh);
 }
 #define NILFS_MAX_TRUNCATE_BLOCKS       16384  /* 64MB for 4KB block */
@@ -617,6 +691,7 @@ void nilfs_truncate(struct inode *inode)
 static void nilfs_clear_inode(struct inode *inode)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
+        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
        /*
         * Free resources allocated in nilfs_read_inode(), here.
@@ -625,10 +700,16 @@ static void nilfs_clear_inode(struct inode *inode)
        brelse(ii->i_bh);
        ii->i_bh = NULL;
+        if (mdi && mdi->mi_palloc_cache)
+                nilfs_palloc_destroy_cache(inode);
        if (test_bit(NILFS_I_BMAP, &ii->i_state))
                nilfs_bmap_clear(ii->i_bmap);
        nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+        if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
+                nilfs_put_root(ii->i_root);
 }
 void nilfs_evict_inode(struct inode *inode)
@@ -637,7 +718,7 @@ void nilfs_evict_inode(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        if (inode->i_nlink || unlikely(is_bad_inode(inode))) {
+        if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
                if (inode->i_data.nrpages)
                        truncate_inode_pages(&inode->i_data, 0);
                end_writeback(inode);
@@ -649,12 +730,16 @@ void nilfs_evict_inode(struct inode *inode)
        if (inode->i_data.nrpages)
                truncate_inode_pages(&inode->i_data, 0);
+        /* TODO: some of the following operations may fail.  */
        nilfs_truncate_bmap(ii, 0);
        nilfs_mark_inode_dirty(inode);
        end_writeback(inode);
+        nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
+        atomic_dec(&ii->i_root->inodes_count);
        nilfs_clear_inode(inode);
-        nilfs_free_inode(inode);
-        /* nilfs_free_inode() marks inode buffer dirty */
        if (IS_SYNC(inode))
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
        nilfs_transaction_commit(sb);
@@ -700,6 +785,17 @@ out_err:
        return err;
 }
+int nilfs_permission(struct inode *inode, int mask)
+{       /*
+         * Permission check: refuse write access with -EROFS when the inode
+         * has a root whose checkpoint number is not
+         * NILFS_CPTREE_CURRENT_CNO; otherwise return the result of
+         * generic_permission().
+         */
+        struct nilfs_root *root = NILFS_I(inode)->i_root;
+        if ((mask & MAY_WRITE) && root &&  /* an inode without a root skips this check */
+            root->cno != NILFS_CPTREE_CURRENT_CNO)
+                return -EROFS; /* snapshot is not writable */
+        return generic_permission(inode, mask, NULL);
+}
 int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
                           struct buffer_head **pbh)
 {
@@ -709,8 +805,8 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
        spin_lock(&sbi->s_inode_lock);
        if (ii->i_bh == NULL) {
                spin_unlock(&sbi->s_inode_lock);
-                err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
+                err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
-                                                  pbh);
+                                                  inode->i_ino, pbh);
                if (unlikely(err))
                        return err;
                spin_lock(&sbi->s_inode_lock);
@@ -790,7 +886,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
        }
        nilfs_update_inode(inode, ibh);
        nilfs_mdt_mark_buffer_dirty(ibh);
-        nilfs_mdt_mark_dirty(sbi->s_ifile);
+        nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
        brelse(ibh);
        return 0;
 }
@@ -808,6 +904,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 void nilfs_dirty_inode(struct inode *inode)
 {
        struct nilfs_transaction_info ti;
+        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
        if (is_bad_inode(inode)) {
                nilfs_warning(inode->i_sb, __func__,
@@ -815,6 +912,10 @@ void nilfs_dirty_inode(struct inode *inode)
                dump_stack();
                return;
        }
+        if (mdi) {
+                nilfs_mdt_mark_dirty(inode);
+                return;
+        }
        nilfs_transaction_begin(inode->i_sb, &ti, 0);
        nilfs_mark_inode_dirty(inode);
        nilfs_transaction_commit(inode->i_sb); /* never fails */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f90a33d9a5b0..3e90f86d5bfe 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -22,7 +22,6 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
-#include <linux/smp_lock.h>     /* lock_kernel(), unlock_kernel() */
 #include <linux/slab.h>
 #include <linux/capability.h>   /* capable() */
 #include <linux/uaccess.h>      /* copy_from_user(), copy_to_user() */
@@ -118,7 +117,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
        if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
                goto out;
-        mutex_lock(&nilfs->ns_mount_mutex);
+        down_read(&inode->i_sb->s_umount);
        nilfs_transaction_begin(inode->i_sb, &ti, 0);
        ret = nilfs_cpfile_change_cpmode(
@@ -128,7 +127,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
        else
                nilfs_transaction_commit(inode->i_sb); /* never fails */
-        mutex_unlock(&nilfs->ns_mount_mutex);
+        up_read(&inode->i_sb->s_umount);
 out:
        mnt_drop_write(filp->f_path.mnt);
        return ret;
@@ -334,7 +333,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
        return 0;
 }
-static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
+static int nilfs_ioctl_move_blocks(struct super_block *sb,
                                   struct nilfs_argv *argv, void *buf)
 {
        size_t nmembs = argv->v_nmembs;
@@ -349,7 +348,7 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
        for (i = 0, vdesc = buf; i < nmembs; ) {
                ino = vdesc->vd_ino;
                cno = vdesc->vd_cno;
-                inode = nilfs_gc_iget(nilfs, ino, cno);
+                inode = nilfs_iget_for_gc(sb, ino, cno);
                if (unlikely(inode == NULL)) {
                        ret = -ENOMEM;
                        goto failed;
@@ -357,11 +356,15 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
                do {
                        ret = nilfs_ioctl_move_inode_block(inode, vdesc,
                                                           &buffers);
-                        if (unlikely(ret < 0))
+                        if (unlikely(ret < 0)) {
+                                iput(inode);
                                goto failed;
+                        }
                        vdesc++;
                } while (++i < nmembs &&
                         vdesc->vd_ino == ino && vdesc->vd_cno == cno);
+                iput(inode); /* The inode still remains in GC inode list */
        }
        list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
@@ -567,7 +570,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
        }
        /*
-         * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+         * nilfs_ioctl_move_blocks() will call nilfs_iget_for_gc(),
         * which will operates an inode list without blocking.
         * To protect the list from concurrent operations,
         * nilfs_ioctl_move_blocks should be atomic operation.
@@ -577,15 +580,16 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                goto out_free;
        }
-        ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
        if (ret < 0)
                printk(KERN_ERR "NILFS: GC failed during preparation: "
                        "cannot read source blocks: err=%d\n", ret);
        else
                ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
-        if (ret < 0)
+        nilfs_remove_all_gcinodes(nilfs);
-                nilfs_remove_all_gcinode(nilfs);
        clear_nilfs_gc_running(nilfs);
 out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d01aff4957d9..39a5b84e2c9f 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -36,7 +36,6 @@
 #define NILFS_MDT_MAX_RA_BLOCKS         (16 - 1)
-#define INIT_UNUSED_INODE_FIELDS
 static int
 nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
@@ -78,25 +77,11 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
                                                     struct buffer_head *,
                                                     void *))
 {
-        struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
        struct super_block *sb = inode->i_sb;
        struct nilfs_transaction_info ti;
        struct buffer_head *bh;
        int err;
-        if (!sb) {
-                /*
-                 * Make sure this function is not called from any
-                 * read-only context.
-                 */
-                if (!nilfs->ns_writer) {
-                        WARN_ON(1);
-                        err = -EROFS;
-                        goto out;
-                }
-                sb = nilfs->ns_writer->s_super;
-        }
        nilfs_transaction_begin(sb, &ti, 0);
        err = -ENOMEM;
@@ -112,7 +97,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
        if (buffer_uptodate(bh))
                goto failed_bh;
-        bh->b_bdev = nilfs->ns_bdev;
+        bh->b_bdev = sb->s_bdev;
        err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
        if (likely(!err)) {
                get_bh(bh);
@@ -129,7 +114,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
                err = nilfs_transaction_commit(sb);
        else
                nilfs_transaction_abort(sb);
- out:
        return err;
 }
@@ -167,9 +152,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
                unlock_buffer(bh);
                goto failed_bh;
        }
-        bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+        map_bh(bh, inode->i_sb, (sector_t)blknum);
-        bh->b_blocknr = (sector_t)blknum;
-        set_buffer_mapped(bh);
        bh->b_end_io = end_buffer_read_sync;
        get_bh(bh);
@@ -398,35 +381,24 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
 static int
 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
-        struct inode *inode = container_of(page->mapping,
+        struct inode *inode;
-                                           struct inode, i_data);
+        struct super_block *sb;
-        struct super_block *sb = inode->i_sb;
-        struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
-        struct nilfs_sb_info *writer = NULL;
        int err = 0;
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
-        if (page->mapping->assoc_mapping)
+        inode = page->mapping->host;
-                return 0; /* Do not request flush for shadow page cache */
+        if (!inode)
-        if (!sb) {
+                return 0;
-                down_read(&nilfs->ns_writer_sem);
-                writer = nilfs->ns_writer;
+        sb = inode->i_sb;
-                if (!writer) {
-                        up_read(&nilfs->ns_writer_sem);
-                        return -EROFS;
-                }
-                sb = writer->s_super;
-        }
        if (wbc->sync_mode == WB_SYNC_ALL)
                err = nilfs_construct_segment(sb);
        else if (wbc->for_reclaim)
                nilfs_flush_segment(sb, inode->i_ino);
-        if (writer)
-                up_read(&nilfs->ns_writer_sem);
        return err;
 }
@@ -439,105 +411,27 @@ static const struct address_space_operations def_mdt_aops = {
 static const struct inode_operations def_mdt_iops;
 static const struct file_operations def_mdt_fops;
-/*
- * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
- * ifile, or gcinodes.  This allows the B-tree code and segment constructor
- * to treat them like regular files, and this helps to simplify the
- * implementation.
- *   On the other hand, some of the pseudo inodes have an irregular point:
- * They don't have valid inode->i_sb pointer because their lifetimes are
- * longer than those of the super block structs; they may continue for
- * several consecutive mounts/umounts.  This would need discussions.
- */
-/**
- * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
- * @nilfs: nilfs object
- * @sb: super block instance the metadata file belongs to
- * @ino: inode number
- * @gfp_mask: gfp mask for data pages
- * @objsz: size of the private object attached to inode->i_private
- */
-struct inode *
-nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-                     ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
-        struct inode *inode = nilfs_alloc_inode_common(nilfs);
+        struct nilfs_mdt_info *mi;
-        if (!inode)
+        mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
-                return NULL;
+        if (!mi)
-        else {
+                return -ENOMEM;
-                struct address_space * const mapping = &inode->i_data;
-                struct nilfs_mdt_info *mi;
-                mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
-                if (!mi) {
-                        nilfs_destroy_inode(inode);
-                        return NULL;
-                }
-                mi->mi_nilfs = nilfs;
-                init_rwsem(&mi->mi_sem);
-                inode->i_sb = sb; /* sb may be NULL for some meta data files */
-                inode->i_blkbits = nilfs->ns_blocksize_bits;
-                inode->i_flags = 0;
-                atomic_set(&inode->i_count, 1);
-                inode->i_nlink = 1;
-                inode->i_ino = ino;
-                inode->i_mode = S_IFREG;
-                inode->i_private = mi;
-#ifdef INIT_UNUSED_INODE_FIELDS
-                atomic_set(&inode->i_writecount, 0);
-                inode->i_size = 0;
-                inode->i_blocks = 0;
-                inode->i_bytes = 0;
-                inode->i_generation = 0;
-#ifdef CONFIG_QUOTA
-                memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
-#endif
-                inode->i_pipe = NULL;
-                inode->i_bdev = NULL;
-                inode->i_cdev = NULL;
-                inode->i_rdev = 0;
-#ifdef CONFIG_SECURITY
-                inode->i_security = NULL;
-#endif
-                inode->dirtied_when = 0;
-                INIT_LIST_HEAD(&inode->i_list);
-                INIT_LIST_HEAD(&inode->i_sb_list);
-                inode->i_state = 0;
-#endif
-                spin_lock_init(&inode->i_lock);
-                mutex_init(&inode->i_mutex);
-                init_rwsem(&inode->i_alloc_sem);
-                mapping->host = NULL;  /* instead of inode */
-                mapping->flags = 0;
-                mapping_set_gfp_mask(mapping, gfp_mask);
-                mapping->assoc_mapping = NULL;
-                mapping->backing_dev_info = nilfs->ns_bdi;
-                inode->i_mapping = mapping;
-        }
-        return inode;
+        init_rwsem(&mi->mi_sem);
-}
+        inode->i_private = mi;
-struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
+        inode->i_mode = S_IFREG;
-                            ino_t ino, size_t objsz)
+        mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
-{
+        inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
-        struct inode *inode;
-        inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
-        if (!inode)
-                return NULL;
        inode->i_op = &def_mdt_iops;
        inode->i_fop = &def_mdt_fops;
        inode->i_mapping->a_ops = &def_mdt_aops;
-        return inode;
+        return 0;
 }
 void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
@@ -550,34 +444,159 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
        mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
-void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
+static const struct address_space_operations shadow_map_aops = {  /* aops handed to both shadow-map mappings in nilfs_mdt_setup_shadow_map() */
+        .sync_page              = block_sync_page,
+};
+/**
+ * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
+ * @inode: inode of the metadata file
+ * @shadow: shadow mapping
+ */
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+                               struct nilfs_shadow_map *shadow)
 {
-        shadow->i_mapping->assoc_mapping = orig->i_mapping;
+        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
-        NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
+        struct backing_dev_info *bdi = inode->i_sb->s_bdi;
-                &NILFS_I(orig)->i_btnode_cache;
+        INIT_LIST_HEAD(&shadow->frozen_buffers);
+        nilfs_mapping_init_once(&shadow->frozen_data);
+        nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+        nilfs_mapping_init_once(&shadow->frozen_btnodes);
+        nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+        mi->mi_shadow = shadow;
+        return 0;
 }
-static void nilfs_mdt_clear(struct inode *inode)
+/**
+ * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map
+ * @inode: inode of the metadata file
+ */
+int nilfs_mdt_save_to_shadow_map(struct inode *inode)
 {
+        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
        struct nilfs_inode_info *ii = NILFS_I(inode);
+        struct nilfs_shadow_map *shadow = mi->mi_shadow;
+        int ret;
-        invalidate_mapping_pages(inode->i_mapping, 0, -1);
+        ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
-        truncate_inode_pages(inode->i_mapping, 0);
+        if (ret)
+                goto out;
+        ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
+                                     &ii->i_btnode_cache);
+        if (ret)
+                goto out;
-        if (test_bit(NILFS_I_BMAP, &ii->i_state))
+        nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store);
-                nilfs_bmap_clear(ii->i_bmap);
+ out:
-        nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+        return ret;
 }
-void nilfs_mdt_destroy(struct inode *inode)
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 {
-        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+        struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+        struct buffer_head *bh_frozen;
+        struct page *page;
+        int blkbits = inode->i_blkbits;
+        int ret = -ENOMEM;
+        page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+        if (!page)
+                return ret;
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, 1 << blkbits, 0);
+        bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+        if (bh_frozen) {
+                if (!buffer_uptodate(bh_frozen))
+                        nilfs_copy_buffer(bh_frozen, bh);
+                if (list_empty(&bh_frozen->b_assoc_buffers)) {
+                        list_add_tail(&bh_frozen->b_assoc_buffers,
+                                      &shadow->frozen_buffers);
+                        set_buffer_nilfs_redirected(bh);
+                } else {
+                        brelse(bh_frozen); /* already frozen */
+                }
+                ret = 0;
+        }
+        unlock_page(page);
+        page_cache_release(page);
+        return ret;
+}
+struct buffer_head *
+nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
+{       /*
+         * Find the buffer in the shadow map's frozen_data mapping that sits
+         * at the same page index and the same in-page block position as @bh.
+         * The result stays NULL when that page is not found or has no
+         * buffers attached.
+         * NOTE(review): nilfs_page_get_nth_block() presumably returns the
+         * buffer with a reference held -- confirm who drops it.
+         */
+        struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
+        struct buffer_head *bh_frozen = NULL;
+        struct page *page;
+        int n;
+        page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+        if (page) {
+                if (page_has_buffers(page)) {
+                        n = bh_offset(bh) >> inode->i_blkbits;  /* position of @bh within its page, in blocks */
+                        bh_frozen = nilfs_page_get_nth_block(page, n);
+                }
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        return bh_frozen;
+}
+static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow)
+{       /*
+         * Empty @shadow->frozen_buffers: unlink every buffer head from the
+         * list and brelse() it.
+         */
+        struct list_head *head = &shadow->frozen_buffers;
+        struct buffer_head *bh;
+        while (!list_empty(head)) {
+                bh = list_first_entry(head, struct buffer_head,
+                                      b_assoc_buffers);
+                list_del_init(&bh->b_assoc_buffers);  /* unlink; nilfs_mdt_freeze_buffer() tests this link with list_empty() */
+                brelse(bh); /* drop ref-count to make it releasable */
+        }
+}
+/**
+ * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
+{
+        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+        struct nilfs_inode_info *ii = NILFS_I(inode);
+        struct nilfs_shadow_map *shadow = mi->mi_shadow;
+        down_write(&mi->mi_sem);
-        if (mdi->mi_palloc_cache)
+        if (mi->mi_palloc_cache)
-                nilfs_palloc_destroy_cache(inode);
+                nilfs_palloc_clear_cache(inode);
-        nilfs_mdt_clear(inode);
+        nilfs_clear_dirty_pages(inode->i_mapping);
+        nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+        nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+        nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+        nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
+        up_write(&mi->mi_sem);
+}
+/**
+ * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches
+ * @inode: inode of the metadata file
+ */
+void nilfs_mdt_clear_shadow_map(struct inode *inode)
+{
+        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+        struct nilfs_shadow_map *shadow = mi->mi_shadow;
-        kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+        down_write(&mi->mi_sem);
-        kfree(mdi);
+        nilfs_release_frozen_buffers(shadow);
-        nilfs_destroy_inode(inode);
+        truncate_inode_pages(&shadow->frozen_data, 0);
+        truncate_inode_pages(&shadow->frozen_btnodes, 0);
+        up_write(&mi->mi_sem);
 }
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 6c4bbb0470fc..b13734bf3521 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -28,26 +28,33 @@
 #include "nilfs.h"
 #include "page.h"
+struct nilfs_shadow_map {  /* saved state of a metadata file, attached via nilfs_mdt_info.mi_shadow */
+        struct nilfs_bmap_store bmap_store;  /* written by nilfs_bmap_save(), read by nilfs_bmap_restore() */
+        struct address_space frozen_data;  /* receives dirty pages copied from the inode's i_mapping */
+        struct address_space frozen_btnodes;  /* receives dirty pages copied from the inode's i_btnode_cache */
+        struct list_head frozen_buffers;  /* buffer heads queued by nilfs_mdt_freeze_buffer() */
+};
 /**
 * struct nilfs_mdt_info - on-memory private data of meta data files
- * @mi_nilfs: back pointer to the_nilfs struct
 * @mi_sem: reader/writer semaphore for meta data operations
 * @mi_bgl: per-blockgroup locking
 * @mi_entry_size: size of an entry
 * @mi_first_entry_offset: offset to the first entry
 * @mi_entries_per_block: number of entries in a block
 * @mi_palloc_cache: persistent object allocator cache
+ * @mi_shadow: shadow of bmap and page caches
 * @mi_blocks_per_group: number of blocks in a group
 * @mi_blocks_per_desc_block: number of blocks per descriptor block
 */
 struct nilfs_mdt_info {
-        struct the_nilfs       *mi_nilfs;
        struct rw_semaphore     mi_sem;
        struct blockgroup_lock *mi_bgl;
        unsigned                mi_entry_size;
        unsigned                mi_first_entry_offset;
        unsigned long           mi_entries_per_block;
        struct nilfs_palloc_cache *mi_palloc_cache;
+        struct nilfs_shadow_map *mi_shadow;
        unsigned long           mi_blocks_per_group;
        unsigned long           mi_blocks_per_desc_block;
 };
@@ -59,9 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
 {
-        struct super_block *sb = inode->i_sb;
+        return NILFS_SB(inode->i_sb)->s_nilfs;
-        return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
 }
 /* Default GFP flags using highmem */
@@ -76,14 +81,17 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz);
-                            size_t);
-struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-                                   ino_t, gfp_t, size_t);
-void nilfs_mdt_destroy(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
-void nilfs_mdt_set_shadow(struct inode *, struct inode *);
+int nilfs_mdt_setup_shadow_map(struct inode *inode,
+                               struct nilfs_shadow_map *shadow);
+int nilfs_mdt_save_to_shadow_map(struct inode *inode);
+void nilfs_mdt_restore_from_shadow_map(struct inode *inode);
+void nilfs_mdt_clear_shadow_map(struct inode *inode);
+int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
+struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
+                                                struct buffer_head *bh);
 #define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
@@ -100,7 +108,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
 static inline __u64 nilfs_mdt_cno(struct inode *inode)
 {
-        return NILFS_MDT(inode)->mi_nilfs->ns_cno;
+        return NILFS_I_NILFS(inode)->ns_cno;
 }
 #define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ad6ed2cf19b4..6e9557ecf161 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -40,7 +40,11 @@
 #include <linux/pagemap.h>
 #include "nilfs.h"
+#include "export.h"
+#define NILFS_FID_SIZE_NON_CONNECTABLE \
+        (offsetof(struct nilfs_fid, parent_gen) / 4)  /* bytes preceding parent_gen, in 4-byte units */
+#define NILFS_FID_SIZE_CONNECTABLE      (sizeof(struct nilfs_fid) / 4)  /* whole struct nilfs_fid, in 4-byte units */
 static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -70,29 +74,13 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        ino = nilfs_inode_by_name(dir, &dentry->d_name);
        inode = NULL;
        if (ino) {
-                inode = nilfs_iget(dir->i_sb, ino);
+                inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
                if (IS_ERR(inode))
                        return ERR_CAST(inode);
        }
        return d_splice_alias(inode, dentry);
 }
-struct dentry *nilfs_get_parent(struct dentry *child)
-{
-        unsigned long ino;
-        struct inode *inode;
-        struct qstr dotdot = {.name = "..", .len = 2};
-        ino = nilfs_inode_by_name(child->d_inode, &dotdot);
-        if (!ino)
-                return ERR_PTR(-ENOENT);
-        inode = nilfs_iget(child->d_inode->i_sb, ino);
-        if (IS_ERR(inode))
-                return ERR_CAST(inode);
-        return d_obtain_alias(inode);
-}
 /*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
@@ -219,7 +207,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        err = nilfs_add_nondir(dentry, inode);
        if (!err)
@@ -468,6 +456,115 @@ out:
        return err;
 }
+/*
+ * Export operations
+ */
+static struct dentry *nilfs_get_parent(struct dentry *child)
+{       /*
+         * Export operation: look up the ".." entry in @child's inode, fetch
+         * that inode under the same root as the child, and return the
+         * result of d_obtain_alias() for it. Returns ERR_PTR(-ENOENT) when
+         * ".." yields no inode number, or the error from nilfs_iget().
+         */
+        unsigned long ino;
+        struct inode *inode;
+        struct qstr dotdot = {.name = "..", .len = 2};
+        struct nilfs_root *root;
+        ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+        if (!ino)
+                return ERR_PTR(-ENOENT);
+        root = NILFS_I(child->d_inode)->i_root;  /* parent is fetched from the child's root */
+        inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+        if (IS_ERR(inode))
+                return ERR_CAST(inode);
+        return d_obtain_alias(inode);
+}
+static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
+                                       u64 ino, u32 gen)
+{       /*
+         * Map (checkpoint number @cno, inode number @ino, generation @gen)
+         * to a dentry via nilfs_iget() and d_obtain_alias().
+         * Returns ERR_PTR(-ESTALE) when @ino is below NILFS_FIRST_INO(sb)
+         * and is not NILFS_ROOT_INO, when nilfs_lookup_root() finds no root
+         * for @cno, or when a non-zero @gen differs from the inode's
+         * i_generation; an error from nilfs_iget() is passed through.
+         */
+        struct nilfs_root *root;
+        struct inode *inode;
+        if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
+                return ERR_PTR(-ESTALE);
+        root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+        if (!root)
+                return ERR_PTR(-ESTALE);
+        inode = nilfs_iget(sb, root, ino);
+        nilfs_put_root(root);  /* released right after the lookup, whether nilfs_iget() succeeded or not */
+        if (IS_ERR(inode))
+                return ERR_CAST(inode);
+        if (gen && inode->i_generation != gen) {  /* gen == 0 disables the generation check */
+                iput(inode);
+                return ERR_PTR(-ESTALE);
+        }
+        return d_obtain_alias(inode);
+}
+static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+                                         int fh_len, int fh_type)
+{       /*
+         * Export operation: decode file handle @fh (viewed as a struct
+         * nilfs_fid) into a dentry for the object itself, using its cno,
+         * ino and gen fields. Returns NULL when @fh_len matches neither
+         * NILFS_FID_SIZE_* value or @fh_type is neither FILEID_NILFS_* type;
+         * otherwise returns the result of nilfs_get_dentry().
+         */
+        struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+        if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
+             fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+            (fh_type != FILEID_NILFS_WITH_PARENT &&
+             fh_type != FILEID_NILFS_WITHOUT_PARENT))
+                return NULL;
+        return nilfs_get_dentry(sb, fid->cno, fid->ino, fid->gen);
+}
+static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+                                         int fh_len, int fh_type)
+{       /*
+         * Export operation: decode the parent recorded in file handle @fh
+         * (viewed as a struct nilfs_fid), using its cno, parent_ino and
+         * parent_gen fields. Only handles of length
+         * NILFS_FID_SIZE_CONNECTABLE and type FILEID_NILFS_WITH_PARENT are
+         * accepted; anything else returns NULL.
+         */
+        struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+        if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+            fh_type != FILEID_NILFS_WITH_PARENT)
+                return NULL;
+        return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
+}
+static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+                           int connectable)
+{       /*
+         * Export operation: fill @fh (viewed as a struct nilfs_fid) with the
+         * root's checkpoint number plus the inode number and generation of
+         * @dentry's inode, and return the file-id type.
+         * *lenp is compared against the NILFS_FID_SIZE_* values on entry and
+         * set to the size actually used on success. Returns 255 when *lenp
+         * is too small for the requested form.
+         */
+        struct nilfs_fid *fid = (struct nilfs_fid *)fh;
+        struct inode *inode = dentry->d_inode;
+        struct nilfs_root *root = NILFS_I(inode)->i_root;
+        int type;
+        if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
+            (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+                return 255;
+        fid->cno = root->cno;
+        fid->ino = inode->i_ino;
+        fid->gen = inode->i_generation;
+        if (connectable && !S_ISDIR(inode->i_mode)) {  /* parent fields are written for non-directories only */
+                struct inode *parent;
+                spin_lock(&dentry->d_lock);  /* held while d_parent is read */
+                parent = dentry->d_parent->d_inode;
+                fid->parent_ino = parent->i_ino;
+                fid->parent_gen = parent->i_generation;
+                spin_unlock(&dentry->d_lock);
+                type = FILEID_NILFS_WITH_PARENT;
+                *lenp = NILFS_FID_SIZE_CONNECTABLE;
+        } else {
+                type = FILEID_NILFS_WITHOUT_PARENT;
+                *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
+        }
+        return type;
+}
 const struct inode_operations nilfs_dir_inode_operations = {
        .create         = nilfs_create,
        .lookup         = nilfs_lookup,
@@ -491,4 +588,12 @@ const struct inode_operations nilfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
+        .permission     = nilfs_permission,
+};
+const struct export_operations nilfs_export_ops = {    /* file-handle callbacks for nilfs */
+        .encode_fh = nilfs_encode_fh,           /* inode -> struct nilfs_fid handle */
+        .fh_to_dentry = nilfs_fh_to_dentry,     /* handle -> dentry via cno/ino/gen */
+        .fh_to_parent = nilfs_fh_to_parent,     /* handle -> dentry via cno/parent_ino/parent_gen */
+        .get_parent = nilfs_get_parent,
 };
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index d3d54046e5f8..f7560da5a567 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -59,6 +59,7 @@ struct nilfs_inode_info {
 #endif
        struct buffer_head *i_bh;       /* i_bh contains a new or dirty
                                           disk inode */
+        struct nilfs_root *i_root;
        struct inode vfs_inode;
 };
@@ -100,7 +101,6 @@ enum {
        NILFS_I_INODE_DIRTY,            /* write_inode is requested */
        NILFS_I_BMAP,                   /* has bmap and btnode_cache */
        NILFS_I_GCINODE,                /* inode for GC, on memory only */
-        NILFS_I_GCDAT,                  /* shadow DAT, on memory only */
 };
 /*
@@ -192,7 +192,7 @@ static inline int nilfs_doing_construction(void)
 static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 {
-        return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
+        return nilfs->ns_dat;
 }
 /*
@@ -200,12 +200,9 @@ static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
 */
 #ifdef CONFIG_NILFS_POSIX_ACL
 #error "NILFS: not yet supported POSIX ACL"
-extern int nilfs_permission(struct inode *, int, struct nameidata *);
 extern int nilfs_acl_chmod(struct inode *);
 extern int nilfs_init_acl(struct inode *, struct inode *);
 #else
-#define nilfs_permission   NULL
 static inline int nilfs_acl_chmod(struct inode *inode)
 {
        return 0;
@@ -247,11 +244,19 @@ extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
 extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
-extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
+                            unsigned long ino);
+struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
+                                unsigned long ino);
+struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
+                         unsigned long ino);
+extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
+                                       unsigned long ino, __u64 cno);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
+int nilfs_permission(struct inode *inode, int mask);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
                                  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
@@ -260,11 +265,7 @@ extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
 extern int nilfs_mark_inode_dirty(struct inode *);
 extern void nilfs_dirty_inode(struct inode *);
-/* namei.c */
-extern struct dentry *nilfs_get_parent(struct dentry *);
 /* super.c */
-extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
@@ -283,8 +284,9 @@ extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
                                                      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
 extern int nilfs_cleanup_super(struct nilfs_sb_info *);
-extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
-extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
+                            struct nilfs_root **root);
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 /* gcinode.c */
 int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
@@ -292,16 +294,8 @@ int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
 int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
                                   struct buffer_head **);
 int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
-int nilfs_init_gccache(struct the_nilfs *);
+int nilfs_init_gcinode(struct inode *inode);
-void nilfs_destroy_gccache(struct the_nilfs *);
+void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
-void nilfs_clear_gcinode(struct inode *);
-struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
-void nilfs_remove_all_gcinode(struct the_nilfs *);
-/* gcdat.c */
-int nilfs_init_gcdat_inode(struct the_nilfs *);
-void nilfs_commit_gcdat_inode(struct the_nilfs *);
-void nilfs_clear_gcdat_inode(struct the_nilfs *);
 /*
 * Inodes and files operations
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index aab11db2cb08..a6c3c2e817f8 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -79,8 +79,8 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 {
        int blkbits = inode->i_blkbits;
        pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
-        struct page *page, *opage;
+        struct page *page;
-        struct buffer_head *bh, *obh;
+        struct buffer_head *bh;
        page = grab_cache_page(mapping, index);
        if (unlikely(!page))
@@ -92,30 +92,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
                page_cache_release(page);
                return NULL;
        }
-        if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
-                /*
-                 * Shadow page cache uses assoc_mapping to point its original
-                 * page cache.  The following code tries the original cache
-                 * if the given cache is a shadow and it didn't hit.
-                 */
-                opage = find_lock_page(mapping->assoc_mapping, index);
-                if (!opage)
-                        return bh;
-                obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
-                                             b_state);
-                if (buffer_uptodate(obh)) {
-                        nilfs_copy_buffer(bh, obh);
-                        if (buffer_dirty(obh)) {
-                                nilfs_mark_buffer_dirty(bh);
-                                if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
-                                        nilfs_mdt_mark_dirty(inode);
-                        }
-                }
-                brelse(obh);
-                unlock_page(opage);
-                page_cache_release(opage);
-        }
        return bh;
 }
@@ -131,6 +107,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
        lock_buffer(bh);
        clear_buffer_nilfs_volatile(bh);
        clear_buffer_nilfs_checked(bh);
+        clear_buffer_nilfs_redirected(bh);
        clear_buffer_dirty(bh);
        if (nilfs_page_buffers_clean(page))
                __nilfs_clear_page_dirty(page);
@@ -483,6 +460,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
                                clear_buffer_dirty(bh);
                                clear_buffer_nilfs_volatile(bh);
                                clear_buffer_nilfs_checked(bh);
+                                clear_buffer_nilfs_redirected(bh);
                                clear_buffer_uptodate(bh);
                                clear_buffer_mapped(bh);
                                unlock_buffer(bh);
@@ -513,6 +491,31 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
        }
        return nc;
 }
+ 
+void nilfs_mapping_init_once(struct address_space *mapping)
+{       /*
+         * Bring @mapping to a clean initial state: zero the whole structure,
+         * then initialize its page radix tree, tree_lock, private_list,
+         * private_lock and the i_mmap lock, tree and nonlinear list.
+         */
+        memset(mapping, 0, sizeof(*mapping));   /* must come first: wipes every member */
+        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+        spin_lock_init(&mapping->tree_lock);
+        INIT_LIST_HEAD(&mapping->private_list);
+        spin_lock_init(&mapping->private_lock);
+        spin_lock_init(&mapping->i_mmap_lock);
+        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+void nilfs_mapping_init(struct address_space *mapping,
+                        struct backing_dev_info *bdi,
+                        const struct address_space_operations *aops)
+{       /*
+         * Set the per-use fields of @mapping: no host inode, flags reset and
+         * a GFP_NOFS allocation mask, no associated mapping, and the
+         * caller's @bdi and @aops.  The tree, list and lock members that
+         * nilfs_mapping_init_once() initializes are not touched here.
+         */
+        mapping->host = NULL;
+        mapping->flags = 0;     /* NOTE(review): reset before the gfp mask is
+                                 * set; the mask is presumably kept in flags,
+                                 * so the order would matter - confirm */
+        mapping_set_gfp_mask(mapping, GFP_NOFS);
+        mapping->assoc_mapping = NULL;
+        mapping->backing_dev_info = bdi;
+        mapping->a_ops = aops;
+}
 /*
 * NILFS2 needs clear_page_dirty() in the following two cases:
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f53d8da41ed7..fb9e8a8a2038 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -35,12 +35,14 @@ enum {
        BH_NILFS_Node,
        BH_NILFS_Volatile,
        BH_NILFS_Checked,
+        BH_NILFS_Redirected,
 };
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)    /* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)              /* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
 BUFFER_FNS(NILFS_Checked, nilfs_checked)        /* buffer is verified */
+BUFFER_FNS(NILFS_Redirected, nilfs_redirected)  /* redirected to a copy */
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
@@ -59,6 +61,10 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_mapping_init_once(struct address_space *mapping);
+void nilfs_mapping_init(struct address_space *mapping,
+                        struct backing_dev_info *bdi,
+                        const struct address_space_operations *aops);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 #define NILFS_PAGE_BUG(page, m, a...) \
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d0c35ef39f6a..5d2711c28da7 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -440,7 +440,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
        segnum[2] = ri->ri_segnum;
        segnum[3] = ri->ri_nextnum;
-        nilfs_attach_writer(nilfs, sbi);
        /*
         * Releasing the next segment of the latest super root.
         * The next segment is invalidated by this recovery.
@@ -480,7 +479,6 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 failed:
        /* No need to recover sufile because it will be destroyed on error */
-        nilfs_detach_writer(nilfs, sbi);
        return err;
 }
@@ -504,6 +502,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
                                      struct nilfs_sb_info *sbi,
+                                      struct nilfs_root *root,
                                      struct list_head *head,
                                      unsigned long *nr_salvaged_blocks)
 {
@@ -515,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
        int err = 0, err2 = 0;
        list_for_each_entry_safe(rb, n, head, list) {
-                inode = nilfs_iget(sbi->s_super, rb->ino);
+                inode = nilfs_iget(sbi->s_super, root, rb->ino);
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                        inode = NULL;
@@ -578,6 +577,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                                 struct nilfs_sb_info *sbi,
+                                 struct nilfs_root *root,
                                 struct nilfs_recovery_info *ri)
 {
        struct buffer_head *bh_sum = NULL;
@@ -597,7 +597,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
        };
        int state = RF_INIT_ST;
-        nilfs_attach_writer(nilfs, sbi);
        pseg_start = ri->ri_lsegs_start;
        seg_seq = ri->ri_lsegs_start_seq;
        segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
@@ -649,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                                goto failed;
                        if (flags & NILFS_SS_LOGEND) {
                                err = nilfs_recover_dsync_blocks(
-                                        nilfs, sbi, &dsync_blocks,
+                                        nilfs, sbi, root, &dsync_blocks,
                                        &nsalvaged_blocks);
                                if (unlikely(err))
                                        goto failed;
@@ -688,7 +687,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 out:
        brelse(bh_sum);
        dispose_recovery_list(&dsync_blocks);
-        nilfs_detach_writer(nilfs, sbi);
        return err;
 confused:
@@ -746,19 +744,20 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
                              struct nilfs_sb_info *sbi,
                              struct nilfs_recovery_info *ri)
 {
+        struct nilfs_root *root;
        int err;
        if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
                return 0;
-        err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
+        err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
        if (unlikely(err)) {
                printk(KERN_ERR
                       "NILFS: error loading the latest checkpoint.\n");
                return err;
        }
-        err = nilfs_do_roll_forward(nilfs, sbi, ri);
+        err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
        if (unlikely(err))
                goto failed;
@@ -770,7 +769,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
                        goto failed;
                }
-                err = nilfs_attach_segment_constructor(sbi);
+                err = nilfs_attach_segment_constructor(sbi, root);
                if (unlikely(err))
                        goto failed;
@@ -788,7 +787,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
        }
 failed:
-        nilfs_detach_checkpoint(sbi);
+        nilfs_put_root(root);
        return err;
 }
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0776ccc2504a..35a07157b980 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -42,11 +42,6 @@ struct nilfs_sc_info;
 * NILFS super-block data in memory
 */
 struct nilfs_sb_info {
-        /* Snapshot status */
-        __u64 s_snapshot_cno;           /* Checkpoint number */
-        atomic_t s_inodes_count;
-        atomic_t s_blocks_count;        /* Reserved (might be deleted) */
        /* Mount options */
        unsigned long s_mount_opt;
        uid_t s_resuid;
@@ -59,8 +54,6 @@ struct nilfs_sb_info {
        /* Fundamental members */
        struct super_block *s_super;    /* reverse pointer to super_block */
        struct the_nilfs *s_nilfs;
-        struct list_head s_list;        /* list head for nilfs->ns_supers */
-        atomic_t s_count;               /* reference count */
        /* Segment constructor */
        struct list_head s_dirty_files; /* dirty files list */
@@ -68,9 +61,6 @@ struct nilfs_sb_info {
        spinlock_t s_inode_lock;        /* Lock for the nilfs inode.
                                           It covers s_dirty_files list */
-        /* Metadata files */
-        struct inode *s_ifile;          /* index file inode */
        /* Inode allocator */
        spinlock_t s_next_gen_lock;
        u32 s_next_generation;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 4588fb9e93df..0f83e93935b2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,7 +371,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
        struct bio *bio = wi->bio;
        int err;
-        if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+        if (segbuf->sb_nbio > 0 &&
+            bdi_write_congested(segbuf->sb_super->s_bdi)) {
                wait_for_completion(&segbuf->sb_bio_event);
                segbuf->sb_nbio--;
                if (unlikely(atomic_read(&segbuf->sb_err))) {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9fd051a33c4f..687d090cea34 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,6 +191,8 @@ int nilfs_transaction_begin(struct super_block *sb,
        if (ret > 0)
                return 0;
+        vfs_check_frozen(sb, SB_FREEZE_WRITE);
        sbi = NILFS_SB(sb);
        nilfs = sbi->s_nilfs;
        down_read(&nilfs->ns_segctor_sem);
@@ -366,8 +368,7 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
        if (nilfs_doing_gc())
                flags = NILFS_SS_GC;
-        err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+        err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, sci->sc_cno);
-                                 sci->sc_sbi->s_nilfs->ns_cno);
        if (unlikely(err))
                return err;
@@ -440,17 +441,26 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
        struct nilfs_finfo *finfo;
        struct nilfs_inode_info *ii;
        struct nilfs_segment_buffer *segbuf;
+        __u64 cno;
        if (sci->sc_blk_cnt == 0)
                return;
        ii = NILFS_I(inode);
+        if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+                cno = ii->i_cno;
+        else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
+                cno = 0;
+        else
+                cno = sci->sc_cno;
        finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
                                                 sizeof(*finfo));
        finfo->fi_ino = cpu_to_le64(inode->i_ino);
        finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
        finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
-        finfo->fi_cno = cpu_to_le64(ii->i_cno);
+        finfo->fi_cno = cpu_to_le64(cno);
        segbuf = sci->sc_curseg;
        segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
@@ -755,12 +765,12 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
        }
 }
-static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
+static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
+                                     struct nilfs_root *root)
 {
-        struct the_nilfs *nilfs = sbi->s_nilfs;
        int ret = 0;
-        if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
+        if (nilfs_mdt_fetch_dirty(root->ifile))
                ret++;
        if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
                ret++;
@@ -785,7 +795,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
        struct nilfs_sb_info *sbi = sci->sc_sbi;
        int ret = 0;
-        if (nilfs_test_metadata_dirty(sbi))
+        if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
                set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
        spin_lock(&sbi->s_inode_lock);
@@ -801,7 +811,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
        struct nilfs_sb_info *sbi = sci->sc_sbi;
        struct the_nilfs *nilfs = sbi->s_nilfs;
-        nilfs_mdt_clear_dirty(sbi->s_ifile);
+        nilfs_mdt_clear_dirty(sci->sc_root->ifile);
        nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
        nilfs_mdt_clear_dirty(nilfs->ns_sufile);
        nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
@@ -848,9 +858,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
        raw_cp->cp_snapshot_list.ssl_next = 0;
        raw_cp->cp_snapshot_list.ssl_prev = 0;
        raw_cp->cp_inodes_count =
-                cpu_to_le64(atomic_read(&sbi->s_inodes_count));
+                cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
        raw_cp->cp_blocks_count =
-                cpu_to_le64(atomic_read(&sbi->s_blocks_count));
+                cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
        raw_cp->cp_nblk_inc =
                cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
        raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
@@ -861,7 +871,8 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
        else
                nilfs_checkpoint_set_minor(raw_cp);
-        nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
+        nilfs_write_inode_common(sci->sc_root->ifile,
+                                 &raw_cp->cp_ifile_inode, 1);
        nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
        return 0;
@@ -886,13 +897,12 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
        }
 }
-static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
-                                            struct inode *ifile)
 {
        struct nilfs_inode_info *ii;
        list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
-                nilfs_fill_in_file_bmap(ifile, ii);
+                nilfs_fill_in_file_bmap(sci->sc_root->ifile, ii);
                set_bit(NILFS_I_COLLECTED, &ii->i_state);
        }
 }
@@ -1135,7 +1145,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
                /* Fall through */
        case NILFS_ST_IFILE:
-                err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+                err = nilfs_segctor_scan_file(sci, sci->sc_root->ifile,
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
                        break;
@@ -1599,7 +1609,7 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
        kunmap_atomic(kaddr, KM_USER0);
        if (!TestSetPageWriteback(clone_page))
-                inc_zone_page_state(clone_page, NR_WRITEBACK);
+                account_page_writeback(clone_page);
        unlock_page(clone_page);
        return 0;
@@ -1900,6 +1910,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
                        set_buffer_uptodate(bh);
                        clear_buffer_dirty(bh);
                        clear_buffer_nilfs_volatile(bh);
+                        clear_buffer_nilfs_redirected(bh);
                        if (bh == segbuf->sb_super_root) {
                                if (bh->b_page != bd_page) {
                                        end_page_writeback(bd_page);
@@ -1936,11 +1947,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
        nilfs_drop_collected_inodes(&sci->sc_dirty_files);
-        if (nilfs_doing_gc()) {
+        if (nilfs_doing_gc())
                nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
-                if (update_sr)
+        else
-                        nilfs_commit_gcdat_inode(nilfs);
-        } else
                nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
        sci->sc_nblk_inc += sci->sc_nblk_this_inc;
@@ -1976,7 +1985,7 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
                                        struct nilfs_sb_info *sbi)
 {
        struct nilfs_inode_info *ii, *n;
-        __u64 cno = sbi->s_nilfs->ns_cno;
+        struct inode *ifile = sci->sc_root->ifile;
        spin_lock(&sbi->s_inode_lock);
 retry:
@@ -1987,14 +1996,14 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
                        spin_unlock(&sbi->s_inode_lock);
                        err = nilfs_ifile_get_inode_block(
-                                sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+                                ifile, ii->vfs_inode.i_ino, &ibh);
                        if (unlikely(err)) {
                                nilfs_warning(sbi->s_super, __func__,
                                              "failed to get inode block.\n");
                                return err;
                        }
                        nilfs_mdt_mark_buffer_dirty(ibh);
-                        nilfs_mdt_mark_dirty(sbi->s_ifile);
+                        nilfs_mdt_mark_dirty(ifile);
                        spin_lock(&sbi->s_inode_lock);
                        if (likely(!ii->i_bh))
                                ii->i_bh = ibh;
@@ -2002,7 +2011,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
                                brelse(ibh);
                        goto retry;
                }
-                ii->i_cno = cno;
                clear_bit(NILFS_I_QUEUED, &ii->i_state);
                set_bit(NILFS_I_BUSY, &ii->i_state);
@@ -2011,8 +2019,6 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
        }
        spin_unlock(&sbi->s_inode_lock);
-        NILFS_I(sbi->s_ifile)->i_cno = cno;
        return 0;
 }
@@ -2021,19 +2027,13 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
 {
        struct nilfs_transaction_info *ti = current->journal_info;
        struct nilfs_inode_info *ii, *n;
-        __u64 cno = sbi->s_nilfs->ns_cno;
        spin_lock(&sbi->s_inode_lock);
        list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
                if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
-                    test_bit(NILFS_I_DIRTY, &ii->i_state)) {
+                    test_bit(NILFS_I_DIRTY, &ii->i_state))
-                        /* The current checkpoint number (=nilfs->ns_cno) is
-                           changed between check-in and check-out only if the
-                           super root is written out.  So, we can update i_cno
-                           for the inodes that remain in the dirty list. */
-                        ii->i_cno = cno;
                        continue;
-                }
                clear_bit(NILFS_I_BUSY, &ii->i_state);
                brelse(ii->i_bh);
                ii->i_bh = NULL;
@@ -2054,12 +2054,13 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
        int err;
        sci->sc_stage.scnt = NILFS_ST_INIT;
+        sci->sc_cno = nilfs->ns_cno;
        err = nilfs_segctor_check_in_files(sci, sbi);
        if (unlikely(err))
                goto out;
-        if (nilfs_test_metadata_dirty(sbi))
+        if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
                set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
        if (nilfs_segctor_clean(sci))
@@ -2091,7 +2092,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                        goto failed;
                if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
-                        nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
+                        nilfs_segctor_fill_in_file_bmap(sci);
                if (mode == SC_LSEG_SR &&
                    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
@@ -2452,9 +2453,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
        list_for_each_entry_safe(ii, n, head, i_dirty) {
                if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
                        continue;
-                hlist_del_init(&ii->vfs_inode.i_hash);
                list_del_init(&ii->i_dirty);
-                nilfs_clear_gcinode(&ii->vfs_inode);
+                iput(&ii->vfs_inode);
        }
 }
@@ -2472,13 +2472,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        nilfs_transaction_lock(sbi, &ti, 1);
-        err = nilfs_init_gcdat_inode(nilfs);
+        err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
        if (unlikely(err))
                goto out_unlock;
        err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
-        if (unlikely(err))
+        if (unlikely(err)) {
+                nilfs_mdt_restore_from_shadow_map(nilfs->ns_dat);
                goto out_unlock;
+        }
        sci->sc_freesegs = kbufs[4];
        sci->sc_nfreesegs = argv[4].v_nmembs;
@@ -2510,7 +2512,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 out_unlock:
        sci->sc_freesegs = NULL;
        sci->sc_nfreesegs = 0;
-        nilfs_clear_gcdat_inode(nilfs);
+        nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
        nilfs_transaction_unlock(sbi);
        return err;
 }
@@ -2672,6 +2674,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
 }
 static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+        __acquires(&sci->sc_state_lock)
+        __releases(&sci->sc_state_lock)
 {
        sci->sc_state |= NILFS_SEGCTOR_QUIT;
@@ -2686,7 +2690,8 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 /*
 * Setup & clean-up functions
 */
-static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
+static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
+                                               struct nilfs_root *root)
 {
        struct nilfs_sc_info *sci;
@@ -2697,6 +2702,9 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
        sci->sc_sbi = sbi;
        sci->sc_super = sbi->s_super;
+        nilfs_get_root(root);
+        sci->sc_root = root;
        init_waitqueue_head(&sci->sc_wait_request);
        init_waitqueue_head(&sci->sc_wait_daemon);
        init_waitqueue_head(&sci->sc_wait_task);
@@ -2771,6 +2779,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
        WARN_ON(!list_empty(&sci->sc_segbufs));
        WARN_ON(!list_empty(&sci->sc_write_logs));
+        nilfs_put_root(sci->sc_root);
        down_write(&sbi->s_nilfs->ns_segctor_sem);
        del_timer_sync(&sci->sc_timer);
@@ -2780,6 +2790,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 /**
 * nilfs_attach_segment_constructor - attach a segment constructor
 * @sbi: nilfs_sb_info
+ * @root: root object of the current filesystem tree
 *
 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
 * initializes it, and starts the segment constructor.
@@ -2789,9 +2800,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 *
 * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+                                     struct nilfs_root *root)
 {
-        struct the_nilfs *nilfs = sbi->s_nilfs;
        int err;
        if (NILFS_SC(sbi)) {
@@ -2803,14 +2814,12 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
                nilfs_detach_segment_constructor(sbi);
        }
-        sbi->s_sc_info = nilfs_segctor_new(sbi);
+        sbi->s_sc_info = nilfs_segctor_new(sbi, root);
        if (!sbi->s_sc_info)
                return -ENOMEM;
-        nilfs_attach_writer(nilfs, sbi);
        err = nilfs_segctor_start_thread(NILFS_SC(sbi));
        if (err) {
-                nilfs_detach_writer(nilfs, sbi);
                kfree(sbi->s_sc_info);
                sbi->s_sc_info = NULL;
        }
@@ -2847,5 +2856,4 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
        up_write(&nilfs->ns_segctor_sem);
        nilfs_dispose_list(sbi, &garbage_list, 1);
-        nilfs_detach_writer(nilfs, sbi);
 }
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 17c487bd8152..cd8056e7cbed 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -29,6 +29,8 @@
 #include <linux/nilfs2_fs.h>
 #include "sb.h"
+struct nilfs_root;
 /**
 * struct nilfs_recovery_info - Recovery information
 * @ri_need_recovery: Recovery status
@@ -87,6 +89,7 @@ struct nilfs_segsum_pointer {
 * struct nilfs_sc_info - Segment constructor information
 * @sc_super: Back pointer to super_block struct
 * @sc_sbi: Back pointer to nilfs_sb_info struct
+ * @sc_root: root object of the current filesystem tree
 * @sc_nblk_inc: Block count of current generation
 * @sc_dirty_files: List of files to be written
 * @sc_gc_inodes: List of GC inodes having blocks to be written
@@ -107,6 +110,7 @@ struct nilfs_segsum_pointer {
 * @sc_datablk_cnt: Data block count of a file
 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
 * @sc_seg_ctime: Creation time
+ * @sc_cno: checkpoint number of current log
 * @sc_flags: Internal flags
 * @sc_state_lock: spinlock for sc_state and so on
 * @sc_state: Segctord state flags
@@ -128,6 +132,7 @@ struct nilfs_segsum_pointer {
 struct nilfs_sc_info {
        struct super_block     *sc_super;
        struct nilfs_sb_info   *sc_sbi;
+        struct nilfs_root      *sc_root;
        unsigned long           sc_nblk_inc;
@@ -156,7 +161,7 @@ struct nilfs_sc_info {
        unsigned long           sc_datablk_cnt;
        unsigned long           sc_nblk_this_inc;
        time_t                  sc_seg_ctime;
+        __u64                   sc_cno;
        unsigned long           sc_flags;
        spinlock_t              sc_state_lock;
@@ -230,7 +235,8 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
                                void **);
-extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
+                                     struct nilfs_root *root);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 /* recovery.c */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3c6cc6005c2e..1d6f488ccae8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -505,7 +505,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 {
        struct buffer_head *header_bh;
        struct nilfs_sufile_header *header;
-        struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+        struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
        void *kaddr;
        int ret;
@@ -583,7 +583,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
        struct nilfs_segment_usage *su;
        struct nilfs_suinfo *si = buf;
        size_t susz = NILFS_MDT(sufile)->mi_entry_size;
-        struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+        struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
        void *kaddr;
        unsigned long nsegs, segusages_per_block;
        ssize_t n;
@@ -635,46 +635,55 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 }
 /**
- * nilfs_sufile_read - read sufile inode
+ * nilfs_sufile_read - read or get sufile inode
- * @sufile: sufile inode
+ * @sb: super block instance
+ * @susize: size of a segment usage entry
 * @raw_inode: on-disk sufile inode
+ * @inodep: buffer to store the inode
 */
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
+                      struct nilfs_inode *raw_inode, struct inode **inodep)
 {
-        struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+        struct inode *sufile;
+        struct nilfs_sufile_info *sui;
        struct buffer_head *header_bh;
        struct nilfs_sufile_header *header;
        void *kaddr;
-        int ret;
+        int err;
-        ret = nilfs_read_inode_common(sufile, raw_inode);
+        sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
-        if (ret < 0)
+        if (unlikely(!sufile))
-                return ret;
+                return -ENOMEM;
+        if (!(sufile->i_state & I_NEW))
+                goto out;
-        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+        err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
-        if (!ret) {
+        if (err)
-                kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+                goto failed;
-                header = kaddr + bh_offset(header_bh);
-                sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-                kunmap_atomic(kaddr, KM_USER0);
-                brelse(header_bh);
-        }
-        return ret;
-}
-/**
+        nilfs_mdt_set_entry_size(sufile, susize,
- * nilfs_sufile_new - create sufile
+                                 sizeof(struct nilfs_sufile_header));
- * @nilfs: nilfs object
- * @susize: size of a segment usage entry
+        err = nilfs_read_inode_common(sufile, raw_inode);
- */
+        if (err)
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
+                goto failed;
-{
-        struct inode *sufile;
+        err = nilfs_sufile_get_header_block(sufile, &header_bh);
+        if (err)
+                goto failed;
-        sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
+        sui = NILFS_SUI(sufile);
-                               sizeof(struct nilfs_sufile_info));
+        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-        if (sufile)
+        header = kaddr + bh_offset(header_bh);
-                nilfs_mdt_set_entry_size(sufile, susize,
+        sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-                                         sizeof(struct nilfs_sufile_header));
+        kunmap_atomic(kaddr, KM_USER0);
-        return sufile;
+        brelse(header_bh);
+        unlock_new_inode(sufile);
+ out:
+        *inodep = sufile;
+        return 0;
+ failed:
+        iget_failed(sufile);
+        return err;
 }
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 15163b8aff7d..a943fbacb45b 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,7 +31,7 @@
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
-        return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
+        return NILFS_I_NILFS(sufile)->ns_nsegments;
 }
 unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
@@ -61,8 +61,8 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
                               struct buffer_head *);
-int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
+int nilfs_sufile_read(struct super_block *sb, size_t susize,
-struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+                      struct nilfs_inode *raw_inode, struct inode **inodep);
 /**
 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..f804d41ec9d3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -45,14 +45,13 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/crc32.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/kobject.h>
-#include <linux/exportfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "nilfs.h"
+#include "export.h"
 #include "mdt.h"
 #include "alloc.h"
 #include "btree.h"
@@ -69,11 +68,12 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
                   "(NILFS)");
 MODULE_LICENSE("GPL");
-struct kmem_cache *nilfs_inode_cachep;
+static struct kmem_cache *nilfs_inode_cachep;
 struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 static void nilfs_set_error(struct nilfs_sb_info *sbi)
@@ -147,7 +147,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 }
-struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
+struct inode *nilfs_alloc_inode(struct super_block *sb)
 {
        struct nilfs_inode_info *ii;
@@ -156,18 +156,20 @@ struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
                return NULL;
        ii->i_bh = NULL;
        ii->i_state = 0;
+        ii->i_cno = 0;
        ii->vfs_inode.i_version = 1;
-        nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
+        nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
        return &ii->vfs_inode;
 }
-struct inode *nilfs_alloc_inode(struct super_block *sb)
-{
-        return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
-}
 void nilfs_destroy_inode(struct inode *inode)
 {
+        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+        if (mdi) {
+                kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
+                kfree(mdi);
+        }
        kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
@@ -178,17 +180,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 retry:
        set_buffer_dirty(nilfs->ns_sbh[0]);
        if (nilfs_test_opt(sbi, BARRIER)) {
                err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-                                          WRITE_SYNC | WRITE_BARRIER);
+                                          WRITE_SYNC | WRITE_FLUSH_FUA);
-                if (err == -EOPNOTSUPP) {
-                        nilfs_warning(sbi->s_super, __func__,
-                                      "barrier-based sync failed. "
-                                      "disabling barriers\n");
-                        nilfs_clear_opt(sbi, BARRIER);
-                        goto retry;
-                }
        } else {
                err = sync_dirty_buffer(nilfs->ns_sbh[0]);
        }
@@ -342,8 +336,6 @@ static void nilfs_put_super(struct super_block *sb)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
-        lock_kernel();
        nilfs_detach_segment_constructor(sbi);
        if (!(sb->s_flags & MS_RDONLY)) {
@@ -351,18 +343,15 @@ static void nilfs_put_super(struct super_block *sb)
                nilfs_cleanup_super(sbi);
                up_write(&nilfs->ns_sem);
        }
-        down_write(&nilfs->ns_super_sem);
-        if (nilfs->ns_current == sbi)
-                nilfs->ns_current = NULL;
-        up_write(&nilfs->ns_super_sem);
-        nilfs_detach_checkpoint(sbi);
+        iput(nilfs->ns_sufile);
-        put_nilfs(sbi->s_nilfs);
+        iput(nilfs->ns_cpfile);
+        iput(nilfs->ns_dat);
+        destroy_nilfs(nilfs);
        sbi->s_super = NULL;
        sb->s_fs_info = NULL;
-        nilfs_put_sbinfo(sbi);
+        kfree(sbi);
-        unlock_kernel();
 }
 static int nilfs_sync_fs(struct super_block *sb, int wait)
@@ -389,21 +378,22 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
        return err;
 }
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+                            struct nilfs_root **rootp)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct nilfs_root *root;
        struct nilfs_checkpoint *raw_cp;
        struct buffer_head *bh_cp;
-        int err;
+        int err = -ENOMEM;
-        down_write(&nilfs->ns_super_sem);
+        root = nilfs_find_or_create_root(
-        list_add(&sbi->s_list, &nilfs->ns_supers);
+                nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno);
-        up_write(&nilfs->ns_super_sem);
+        if (!root)
+                return err;
-        err = -ENOMEM;
+        if (root->ifile)
-        sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
+                goto reuse; /* already attached checkpoint */
-        if (!sbi->s_ifile)
-                goto delist;
        down_read(&nilfs->ns_segctor_sem);
        err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -419,45 +409,64 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
                }
                goto failed;
        }
-        err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
-        if (unlikely(err))
+        err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
+                               &raw_cp->cp_ifile_inode, &root->ifile);
+        if (err)
                goto failed_bh;
-        atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-        atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+        atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
+        atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
        nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+ reuse:
+        *rootp = root;
        return 0;
 failed_bh:
        nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 failed:
-        nilfs_mdt_destroy(sbi->s_ifile);
+        nilfs_put_root(root);
-        sbi->s_ifile = NULL;
+        return err;
+}
+static int nilfs_freeze(struct super_block *sb)
+{
+        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct the_nilfs *nilfs = sbi->s_nilfs;
+        int err;
- delist:
+        if (sb->s_flags & MS_RDONLY)
-        down_write(&nilfs->ns_super_sem);
+                return 0;
-        list_del_init(&sbi->s_list);
-        up_write(&nilfs->ns_super_sem);
+        /* Mark super block clean */
+        down_write(&nilfs->ns_sem);
+        err = nilfs_cleanup_super(sbi);
+        up_write(&nilfs->ns_sem);
        return err;
 }
-void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
+static int nilfs_unfreeze(struct super_block *sb)
 {
+        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
-        nilfs_mdt_destroy(sbi->s_ifile);
+        if (sb->s_flags & MS_RDONLY)
-        sbi->s_ifile = NULL;
+                return 0;
-        down_write(&nilfs->ns_super_sem);
-        list_del_init(&sbi->s_list);
+        down_write(&nilfs->ns_sem);
-        up_write(&nilfs->ns_super_sem);
+        nilfs_setup_super(sbi, false);
+        up_write(&nilfs->ns_sem);
+        return 0;
 }
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
-        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
-        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct the_nilfs *nilfs = root->nilfs;
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
        unsigned long long blocks;
        unsigned long overhead;
@@ -493,7 +502,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_bfree = nfreeblocks;
        buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
                (buf->f_bfree - nrsvblocks) : 0;
-        buf->f_files = atomic_read(&sbi->s_inodes_count);
+        buf->f_files = atomic_read(&root->inodes_count);
        buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
        buf->f_namelen = NILFS_NAME_LEN;
        buf->f_fsid.val[0] = (u32)id;
@@ -506,12 +515,12 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
        struct super_block *sb = vfs->mnt_sb;
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
+        struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
        if (!nilfs_test_opt(sbi, BARRIER))
                seq_puts(seq, ",nobarrier");
-        if (nilfs_test_opt(sbi, SNAPSHOT))
+        if (root->cno != NILFS_CPTREE_CURRENT_CNO)
-                seq_printf(seq, ",cp=%llu",
+                seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
-                           (unsigned long long int)sbi->s_snapshot_cno);
        if (nilfs_test_opt(sbi, ERRORS_PANIC))
                seq_puts(seq, ",errors=panic");
        if (nilfs_test_opt(sbi, ERRORS_CONT))
@@ -537,6 +546,8 @@ static const struct super_operations nilfs_sops = {
        .put_super      = nilfs_put_super,
        /* .write_super    = nilfs_write_super, */
        .sync_fs        = nilfs_sync_fs,
+        .freeze_fs      = nilfs_freeze,
+        .unfreeze_fs    = nilfs_unfreeze,
        /* .write_super_lockfs */
        /* .unlockfs */
        .statfs         = nilfs_statfs,
@@ -545,48 +556,6 @@ static const struct super_operations nilfs_sops = {
        .show_options = nilfs_show_options
 };
-static struct inode *
-nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
-{
-        struct inode *inode;
-        if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
-            ino != NILFS_SKETCH_INO)
-                return ERR_PTR(-ESTALE);
-        inode = nilfs_iget(sb, ino);
-        if (IS_ERR(inode))
-                return ERR_CAST(inode);
-        if (generation && inode->i_generation != generation) {
-                iput(inode);
-                return ERR_PTR(-ESTALE);
-        }
-        return inode;
-}
-static struct dentry *
-nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
-                   int fh_type)
-{
-        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-                                    nilfs_nfs_get_inode);
-}
-static struct dentry *
-nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
-                   int fh_type)
-{
-        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-                                    nilfs_nfs_get_inode);
-}
-static const struct export_operations nilfs_export_ops = {
-        .fh_to_dentry = nilfs_fh_to_dentry,
-        .fh_to_parent = nilfs_fh_to_parent,
-        .get_parent = nilfs_get_parent,
-};
 enum {
        Opt_err_cont, Opt_err_panic, Opt_err_ro,
        Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
@@ -612,7 +581,6 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        char *p;
        substring_t args[MAX_OPT_ARGS];
-        int option;
        if (!options)
                return 1;
@@ -650,30 +618,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
                        nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
                        break;
                case Opt_snapshot:
-                        if (match_int(&args[0], &option) || option <= 0)
-                                return 0;
                        if (is_remount) {
-                                if (!nilfs_test_opt(sbi, SNAPSHOT)) {
+                                printk(KERN_ERR
-                                        printk(KERN_ERR
+                                       "NILFS: \"%s\" option is invalid "
-                                               "NILFS: cannot change regular "
+                                       "for remount.\n", p);
-                                               "mount to snapshot.\n");
-                                        return 0;
-                                } else if (option != sbi->s_snapshot_cno) {
-                                        printk(KERN_ERR
-                                               "NILFS: cannot remount to a "
-                                               "different snapshot.\n");
-                                        return 0;
-                                }
-                                break;
-                        }
-                        if (!(sb->s_flags & MS_RDONLY)) {
-                                printk(KERN_ERR "NILFS: cannot mount snapshot "
-                                       "read/write.  A read-only option is "
-                                       "required.\n");
                                return 0;
                        }
-                        sbi->s_snapshot_cno = option;
-                        nilfs_set_opt(sbi, SNAPSHOT);
                        break;
                case Opt_norecovery:
                        nilfs_set_opt(sbi, NORECOVERY);
@@ -701,7 +651,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
                NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
-static int nilfs_setup_super(struct nilfs_sb_info *sbi)
+static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
        struct nilfs_super_block **sbp;
@@ -713,6 +663,9 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
        if (!sbp)
                return -EIO;
+        if (!is_mount)
+                goto skip_mount_setup;
        max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
        mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
@@ -729,9 +682,11 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
                sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
        sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+        sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+skip_mount_setup:
        sbp[0]->s_state =
                cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
-        sbp[0]->s_mtime = cpu_to_le64(get_seconds());
        /* synchronize sbp[1] with sbp[0] */
        memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
        return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
@@ -798,22 +753,156 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
        return 0;
 }
+static int nilfs_get_root_dentry(struct super_block *sb,
+                                 struct nilfs_root *root,
+                                 struct dentry **root_dentry)
+{
+        struct inode *inode;
+        struct dentry *dentry;
+        int ret = 0;
+        inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
+        if (IS_ERR(inode)) {
+                printk(KERN_ERR "NILFS: get root inode failed\n");
+                ret = PTR_ERR(inode);
+                goto out;
+        }
+        if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
+                iput(inode);
+                printk(KERN_ERR "NILFS: corrupt root inode.\n");
+                ret = -EINVAL;
+                goto out;
+        }
+        if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
+                dentry = d_find_alias(inode);
+                if (!dentry) {
+                        dentry = d_alloc_root(inode);
+                        if (!dentry) {
+                                iput(inode);
+                                ret = -ENOMEM;
+                                goto failed_dentry;
+                        }
+                } else {
+                        iput(inode);
+                }
+        } else {
+                dentry = d_obtain_alias(inode);
+                if (IS_ERR(dentry)) {
+                        ret = PTR_ERR(dentry);
+                        goto failed_dentry;
+                }
+        }
+        *root_dentry = dentry;
+ out:
+        return ret;
+ failed_dentry:
+        printk(KERN_ERR "NILFS: get root dentry failed\n");
+        goto out;
+}
+static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
+                                 struct dentry **root_dentry)
+{
+        struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
+        struct nilfs_root *root;
+        int ret;
+        down_read(&nilfs->ns_segctor_sem);
+        ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno);
+        up_read(&nilfs->ns_segctor_sem);
+        if (ret < 0) {
+                ret = (ret == -ENOENT) ? -EINVAL : ret;
+                goto out;
+        } else if (!ret) {
+                printk(KERN_ERR "NILFS: The specified checkpoint is "
+                       "not a snapshot (checkpoint number=%llu).\n",
+                       (unsigned long long)cno);
+                ret = -EINVAL;
+                goto out;
+        }
+        ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
+        if (ret) {
+                printk(KERN_ERR "NILFS: error loading snapshot "
+                       "(checkpoint number=%llu).\n",
+               (unsigned long long)cno);
+                goto out;
+        }
+        ret = nilfs_get_root_dentry(s, root, root_dentry);
+        nilfs_put_root(root);
+ out:
+        return ret;
+}
+static int nilfs_tree_was_touched(struct dentry *root_dentry)
+{
+        return atomic_read(&root_dentry->d_count) > 1;
+}
+/**
+ * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
+ * @root_dentry: root dentry of the tree to be shrunk
+ *
+ * This function returns true if the tree was in-use.
+ */
+static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
+{
+        if (have_submounts(root_dentry))
+                return true;
+        shrink_dcache_parent(root_dentry);
+        return nilfs_tree_was_touched(root_dentry);
+}
+int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
+{
+        struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+        struct nilfs_root *root;
+        struct inode *inode;
+        struct dentry *dentry;
+        int ret;
+        if (cno < 0 || cno > nilfs->ns_cno)
+                return false;
+        if (cno >= nilfs_last_cno(nilfs))
+                return true;    /* protect recent checkpoints */
+        ret = false;
+        root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+        if (root) {
+                inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
+                if (inode) {
+                        dentry = d_find_alias(inode);
+                        if (dentry) {
+                                if (nilfs_tree_was_touched(dentry))
+                                        ret = nilfs_try_to_shrink_tree(dentry);
+                                dput(dentry);
+                        }
+                        iput(inode);
+                }
+                nilfs_put_root(root);
+        }
+        return ret;
+}
 /**
 * nilfs_fill_super() - initialize a super block instance
 * @sb: super_block
 * @data: mount options
 * @silent: silent mode flag
- * @nilfs: the_nilfs struct
 *
 * This function is called exclusively by nilfs->ns_mount_mutex.
 * So, the recovery process is protected from other simultaneous mounts.
 */
 static int
-nilfs_fill_super(struct super_block *sb, void *data, int silent,
+nilfs_fill_super(struct super_block *sb, void *data, int silent)
-                 struct the_nilfs *nilfs)
 {
+        struct the_nilfs *nilfs;
        struct nilfs_sb_info *sbi;
-        struct inode *root;
+        struct nilfs_root *fsroot;
+        struct backing_dev_info *bdi;
        __u64 cno;
        int err;
@@ -822,19 +911,21 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
                return -ENOMEM;
        sb->s_fs_info = sbi;
+        sbi->s_super = sb;
-        get_nilfs(nilfs);
+        nilfs = alloc_nilfs(sb->s_bdev);
+        if (!nilfs) {
+                err = -ENOMEM;
+                goto failed_sbi;
+        }
        sbi->s_nilfs = nilfs;
-        sbi->s_super = sb;
-        atomic_set(&sbi->s_count, 1);
        err = init_nilfs(nilfs, sbi, (char *)data);
        if (err)
-                goto failed_sbi;
+                goto failed_nilfs;
        spin_lock_init(&sbi->s_inode_lock);
        INIT_LIST_HEAD(&sbi->s_dirty_files);
-        INIT_LIST_HEAD(&sbi->s_list);
        /*
         * Following initialization is overlapped because
@@ -850,94 +941,59 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
        sb->s_export_op = &nilfs_export_ops;
        sb->s_root = NULL;
        sb->s_time_gran = 1;
-        sb->s_bdi = nilfs->ns_bdi;
+        bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+        sb->s_bdi = bdi ? : &default_backing_dev_info;
        err = load_nilfs(nilfs, sbi);
        if (err)
-                goto failed_sbi;
+                goto failed_nilfs;
        cno = nilfs_last_cno(nilfs);
+        err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
-        if (sb->s_flags & MS_RDONLY) {
-                if (nilfs_test_opt(sbi, SNAPSHOT)) {
-                        down_read(&nilfs->ns_segctor_sem);
-                        err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
-                                                       sbi->s_snapshot_cno);
-                        up_read(&nilfs->ns_segctor_sem);
-                        if (err < 0) {
-                                if (err == -ENOENT)
-                                        err = -EINVAL;
-                                goto failed_sbi;
-                        }
-                        if (!err) {
-                                printk(KERN_ERR
-                                       "NILFS: The specified checkpoint is "
-                                       "not a snapshot "
-                                       "(checkpoint number=%llu).\n",
-                                       (unsigned long long)sbi->s_snapshot_cno);
-                                err = -EINVAL;
-                                goto failed_sbi;
-                        }
-                        cno = sbi->s_snapshot_cno;
-                }
-        }
-        err = nilfs_attach_checkpoint(sbi, cno);
        if (err) {
-                printk(KERN_ERR "NILFS: error loading a checkpoint"
+                printk(KERN_ERR "NILFS: error loading last checkpoint "
-                       " (checkpoint number=%llu).\n", (unsigned long long)cno);
+                       "(checkpoint number=%llu).\n", (unsigned long long)cno);
-                goto failed_sbi;
+                goto failed_unload;
        }
        if (!(sb->s_flags & MS_RDONLY)) {
-                err = nilfs_attach_segment_constructor(sbi);
+                err = nilfs_attach_segment_constructor(sbi, fsroot);
                if (err)
                        goto failed_checkpoint;
        }
-        root = nilfs_iget(sb, NILFS_ROOT_INO);
+        err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root);
-        if (IS_ERR(root)) {
+        if (err)
-                printk(KERN_ERR "NILFS: get root inode failed\n");
-                err = PTR_ERR(root);
-                goto failed_segctor;
-        }
-        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-                iput(root);
-                printk(KERN_ERR "NILFS: corrupt root inode.\n");
-                err = -EINVAL;
-                goto failed_segctor;
-        }
-        sb->s_root = d_alloc_root(root);
-        if (!sb->s_root) {
-                iput(root);
-                printk(KERN_ERR "NILFS: get root dentry failed\n");
-                err = -ENOMEM;
                goto failed_segctor;
-        }
+        nilfs_put_root(fsroot);
        if (!(sb->s_flags & MS_RDONLY)) {
                down_write(&nilfs->ns_sem);
-                nilfs_setup_super(sbi);
+                nilfs_setup_super(sbi, true);
                up_write(&nilfs->ns_sem);
        }
-        down_write(&nilfs->ns_super_sem);
-        if (!nilfs_test_opt(sbi, SNAPSHOT))
-                nilfs->ns_current = sbi;
-        up_write(&nilfs->ns_super_sem);
        return 0;
 failed_segctor:
        nilfs_detach_segment_constructor(sbi);
 failed_checkpoint:
-        nilfs_detach_checkpoint(sbi);
+        nilfs_put_root(fsroot);
+ failed_unload:
+        iput(nilfs->ns_sufile);
+        iput(nilfs->ns_cpfile);
+        iput(nilfs->ns_dat);
+ failed_nilfs:
+        destroy_nilfs(nilfs);
 failed_sbi:
-        put_nilfs(nilfs);
        sb->s_fs_info = NULL;
-        nilfs_put_sbinfo(sbi);
+        kfree(sbi);
        return err;
 }
@@ -947,15 +1003,10 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        struct the_nilfs *nilfs = sbi->s_nilfs;
        unsigned long old_sb_flags;
        struct nilfs_mount_options old_opts;
-        int was_snapshot, err;
+        int err;
-        lock_kernel();
-        down_write(&nilfs->ns_super_sem);
        old_sb_flags = sb->s_flags;
        old_opts.mount_opt = sbi->s_mount_opt;
-        old_opts.snapshot_cno = sbi->s_snapshot_cno;
-        was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
        if (!parse_options(data, sb, 1)) {
                err = -EINVAL;
@@ -964,11 +1015,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
        err = -EINVAL;
-        if (was_snapshot && !(*flags & MS_RDONLY)) {
-                printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
-                       "read/write.\n", sb->s_id);
-                goto restore_opts;
-        }
        if (!nilfs_valid_fs(nilfs)) {
                printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -993,6 +1039,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                up_write(&nilfs->ns_sem);
        } else {
                __u64 features;
+                struct nilfs_root *root;
                /*
                 * Mounting a RDONLY partition read-write, so reread and
@@ -1014,25 +1061,21 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                sb->s_flags &= ~MS_RDONLY;
-                err = nilfs_attach_segment_constructor(sbi);
+                root = NILFS_I(sb->s_root->d_inode)->i_root;
+                err = nilfs_attach_segment_constructor(sbi, root);
                if (err)
                        goto restore_opts;
                down_write(&nilfs->ns_sem);
-                nilfs_setup_super(sbi);
+                nilfs_setup_super(sbi, true);
                up_write(&nilfs->ns_sem);
        }
 out:
-        up_write(&nilfs->ns_super_sem);
-        unlock_kernel();
        return 0;
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.mount_opt;
-        sbi->s_snapshot_cno = old_opts.snapshot_cno;
-        up_write(&nilfs->ns_super_sem);
-        unlock_kernel();
        return err;
 }
@@ -1052,7 +1095,7 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 {
        char *p, *options = data;
        substring_t args[MAX_OPT_ARGS];
-        int option, token;
+        int token;
        int ret = 0;
        do {
@@ -1060,16 +1103,18 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
                if (p != NULL && *p) {
                        token = match_token(p, tokens, args);
                        if (token == Opt_snapshot) {
-                                if (!(sd->flags & MS_RDONLY))
+                                if (!(sd->flags & MS_RDONLY)) {
                                        ret++;
-                                else {
+                                } else {
-                                        ret = match_int(&args[0], &option);
+                                        sd->cno = simple_strtoull(args[0].from,
-                                        if (!ret) {
+                                                                  NULL, 0);
-                                                if (option > 0)
+                                        /*
-                                                        sd->cno = option;
+                                         * No need to see the end pointer;
-                                                else
+                                         * match_token() has done syntax
-                                                        ret++;
+                                         * checking.
-                                        }
+                                         */
+                                        if (sd->cno == 0)
+                                                ret++;
                                }
                        }
                        if (ret)
@@ -1086,43 +1131,33 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
 static int nilfs_set_bdev_super(struct super_block *s, void *data)
 {
-        struct nilfs_super_data *sd = data;
+        s->s_bdev = data;
-        s->s_bdev = sd->bdev;
        s->s_dev = s->s_bdev->bd_dev;
        return 0;
 }
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
-        struct nilfs_super_data *sd = data;
+        return (void *)s->s_bdev == data;
-        return sd->sbi && s->s_fs_info == (void *)sd->sbi;
 }
-static int
+static struct dentry *
-nilfs_get_sb(struct file_system_type *fs_type, int flags,
+nilfs_mount(struct file_system_type *fs_type, int flags,
-             const char *dev_name, void *data, struct vfsmount *mnt)
+             const char *dev_name, void *data)
 {
        struct nilfs_super_data sd;
        struct super_block *s;
        fmode_t mode = FMODE_READ;
-        struct the_nilfs *nilfs;
+        struct dentry *root_dentry;
-        int err, need_to_close = 1;
+        int err, s_new = false;
        if (!(flags & MS_RDONLY))
                mode |= FMODE_WRITE;
        sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(sd.bdev))
-                return PTR_ERR(sd.bdev);
+                return ERR_CAST(sd.bdev);
-        /*
-         * To get mount instance using sget() vfs-routine, NILFS needs
-         * much more information than normal filesystems to identify mount
-         * instance.  For snapshot mounts, not only a mount type (ro-mount
-         * or rw-mount) but also a checkpoint number is required.
-         */
        sd.cno = 0;
        sd.flags = flags;
        if (nilfs_identify((char *)data, &sd)) {
@@ -1130,101 +1165,91 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
                goto failed;
        }
-        nilfs = find_or_create_nilfs(sd.bdev);
-        if (!nilfs) {
-                err = -ENOMEM;
-                goto failed;
-        }
-        mutex_lock(&nilfs->ns_mount_mutex);
-        if (!sd.cno) {
-                /*
-                 * Check if an exclusive mount exists or not.
-                 * Snapshot mounts coexist with a current mount
-                 * (i.e. rw-mount or ro-mount), whereas rw-mount and
-                 * ro-mount are mutually exclusive.
-                 */
-                down_read(&nilfs->ns_super_sem);
-                if (nilfs->ns_current &&
-                    ((nilfs->ns_current->s_super->s_flags ^ flags)
-                     & MS_RDONLY)) {
-                        up_read(&nilfs->ns_super_sem);
-                        err = -EBUSY;
-                        goto failed_unlock;
-                }
-                up_read(&nilfs->ns_super_sem);
-        }
-        /*
-         * Find existing nilfs_sb_info struct
-         */
-        sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
        /*
-         * Get super block instance holding the nilfs_sb_info struct.
+         * once the super is inserted into the list by sget, s_umount
-         * A new instance is allocated if no existing mount is present or
+         * will protect the lockfs code from trying to start a snapshot
-         * existing instance has been unmounted.
+         * while we are mounting
         */
-        s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+        mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
-        if (sd.sbi)
+        if (sd.bdev->bd_fsfreeze_count > 0) {
-                nilfs_put_sbinfo(sd.sbi);
+                mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+                err = -EBUSY;
+                goto failed;
+        }
+        s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev);
+        mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
        if (IS_ERR(s)) {
                err = PTR_ERR(s);
-                goto failed_unlock;
+                goto failed;
        }
        if (!s->s_root) {
                char b[BDEVNAME_SIZE];
+                s_new = true;
                /* New superblock instance created */
                s->s_flags = flags;
                s->s_mode = mode;
                strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
                sb_set_blocksize(s, block_size(sd.bdev));
-                err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
+                err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-                                       nilfs);
                if (err)
-                        goto cancel_new;
+                        goto failed_super;
                s->s_flags |= MS_ACTIVE;
-                need_to_close = 0;
+        } else if (!sd.cno) {
+                int busy = false;
+                if (nilfs_tree_was_touched(s->s_root)) {
+                        busy = nilfs_try_to_shrink_tree(s->s_root);
+                        if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
+                                printk(KERN_ERR "NILFS: the device already "
+                                       "has a %s mount.\n",
+                                       (s->s_flags & MS_RDONLY) ?
+                                       "read-only" : "read/write");
+                                err = -EBUSY;
+                                goto failed_super;
+                        }
+                }
+                if (!busy) {
+                        /*
+                         * Try remount to setup mount states if the current
+                         * tree is not mounted and only snapshots use this sb.
+                         */
+                        err = nilfs_remount(s, &flags, data);
+                        if (err)
+                                goto failed_super;
+                }
        }
-        mutex_unlock(&nilfs->ns_mount_mutex);
+        if (sd.cno) {
-        put_nilfs(nilfs);
+                err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
-        if (need_to_close)
+                if (err)
-                close_bdev_exclusive(sd.bdev, mode);
+                        goto failed_super;
-        simple_set_mnt(mnt, s);
+        } else {
-        return 0;
+                root_dentry = dget(s->s_root);
+        }
- failed_unlock:
+        if (!s_new)
-        mutex_unlock(&nilfs->ns_mount_mutex);
+                close_bdev_exclusive(sd.bdev, mode);
-        put_nilfs(nilfs);
- failed:
-        close_bdev_exclusive(sd.bdev, mode);
-        return err;
+        return root_dentry;
- cancel_new:
+ failed_super:
-        /* Abandoning the newly allocated superblock */
-        mutex_unlock(&nilfs->ns_mount_mutex);
-        put_nilfs(nilfs);
        deactivate_locked_super(s);
-        /*
-         * deactivate_locked_super() invokes close_bdev_exclusive().
+ failed:
-         * We must finish all post-cleaning before this call;
+        if (!s_new)
-         * put_nilfs() needs the block device.
+                close_bdev_exclusive(sd.bdev, mode);
-         */
+        return ERR_PTR(err);
-        return err;
 }
 struct file_system_type nilfs_fs_type = {
        .owner    = THIS_MODULE,
        .name     = "nilfs2",
-        .get_sb   = nilfs_get_sb,
+        .mount    = nilfs_mount,
        .kill_sb  = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917fc..0254be2d73c6 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,9 +35,6 @@
 #include "segbuf.h"
-static LIST_HEAD(nilfs_objects);
-static DEFINE_SPINLOCK(nilfs_lock);
 static int nilfs_valid_sb(struct nilfs_super_block *sbp);
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
@@ -61,16 +58,13 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 }
 /**
- * alloc_nilfs - allocate the_nilfs structure
+ * alloc_nilfs - allocate a nilfs object
 * @bdev: block device to which the_nilfs is related
 *
- * alloc_nilfs() allocates memory for the_nilfs and
- * initializes its reference count and locks.
- *
 * Return Value: On success, pointer to the_nilfs is returned.
 * On error, NULL is returned.
 */
-static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
        struct the_nilfs *nilfs;
@@ -79,103 +73,38 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
                return NULL;
        nilfs->ns_bdev = bdev;
-        atomic_set(&nilfs->ns_count, 1);
        atomic_set(&nilfs->ns_ndirtyblks, 0);
        init_rwsem(&nilfs->ns_sem);
-        init_rwsem(&nilfs->ns_super_sem);
+        INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
-        mutex_init(&nilfs->ns_mount_mutex);
-        init_rwsem(&nilfs->ns_writer_sem);
-        INIT_LIST_HEAD(&nilfs->ns_list);
-        INIT_LIST_HEAD(&nilfs->ns_supers);
        spin_lock_init(&nilfs->ns_last_segment_lock);
-        nilfs->ns_gc_inodes_h = NULL;
+        nilfs->ns_cptree = RB_ROOT;
+        spin_lock_init(&nilfs->ns_cptree_lock);
        init_rwsem(&nilfs->ns_segctor_sem);
        return nilfs;
 }
 /**
- * find_or_create_nilfs - find or create nilfs object
+ * destroy_nilfs - destroy nilfs object
- * @bdev: block device to which the_nilfs is related
+ * @nilfs: nilfs object to be released
- *
- * find_nilfs() looks up an existent nilfs object created on the
- * device and gets the reference count of the object.  If no nilfs object
- * is found on the device, a new nilfs object is allocated.
- *
- * Return Value: On success, pointer to the nilfs object is returned.
- * On error, NULL is returned.
- */
-struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
-{
-        struct the_nilfs *nilfs, *new = NULL;
- retry:
-        spin_lock(&nilfs_lock);
-        list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
-                if (nilfs->ns_bdev == bdev) {
-                        get_nilfs(nilfs);
-                        spin_unlock(&nilfs_lock);
-                        if (new)
-                                put_nilfs(new);
-                        return nilfs; /* existing object */
-                }
-        }
-        if (new) {
-                list_add_tail(&new->ns_list, &nilfs_objects);
-                spin_unlock(&nilfs_lock);
-                return new; /* new object */
-        }
-        spin_unlock(&nilfs_lock);
-        new = alloc_nilfs(bdev);
-        if (new)
-                goto retry;
-        return NULL; /* insufficient memory */
-}
-/**
- * put_nilfs - release a reference to the_nilfs
- * @nilfs: the_nilfs structure to be released
- *
- * put_nilfs() decrements a reference counter of the_nilfs.
- * If the reference count reaches zero, the_nilfs is freed.
 */
-void put_nilfs(struct the_nilfs *nilfs)
+void destroy_nilfs(struct the_nilfs *nilfs)
 {
-        spin_lock(&nilfs_lock);
-        if (!atomic_dec_and_test(&nilfs->ns_count)) {
-                spin_unlock(&nilfs_lock);
-                return;
-        }
-        list_del_init(&nilfs->ns_list);
-        spin_unlock(&nilfs_lock);
-        /*
-         * Increment of ns_count never occurs below because the caller
-         * of get_nilfs() holds at least one reference to the_nilfs.
-         * Thus its exclusion control is not required here.
-         */
        might_sleep();
-        if (nilfs_loaded(nilfs)) {
-                nilfs_mdt_destroy(nilfs->ns_sufile);
-                nilfs_mdt_destroy(nilfs->ns_cpfile);
-                nilfs_mdt_destroy(nilfs->ns_dat);
-                nilfs_mdt_destroy(nilfs->ns_gc_dat);
-        }
        if (nilfs_init(nilfs)) {
-                nilfs_destroy_gccache(nilfs);
                brelse(nilfs->ns_sbh[0]);
                brelse(nilfs->ns_sbh[1]);
        }
        kfree(nilfs);
 }
-static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
+static int nilfs_load_super_root(struct the_nilfs *nilfs,
+                                 struct super_block *sb, sector_t sr_block)
 {
        struct buffer_head *bh_sr;
        struct nilfs_super_root *raw_sr;
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
+        struct nilfs_inode *rawi;
        unsigned dat_entry_size, segment_usage_size, checkpoint_size;
        unsigned inode_size;
        int err;
@@ -192,40 +121,22 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
        inode_size = nilfs->ns_inode_size;
-        err = -ENOMEM;
+        rawi = (void *)bh_sr->b_data + NILFS_SR_DAT_OFFSET(inode_size);
-        nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
+        err = nilfs_dat_read(sb, dat_entry_size, rawi, &nilfs->ns_dat);
-        if (unlikely(!nilfs->ns_dat))
+        if (err)
                goto failed;
-        nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
+        rawi = (void *)bh_sr->b_data + NILFS_SR_CPFILE_OFFSET(inode_size);
-        if (unlikely(!nilfs->ns_gc_dat))
+        err = nilfs_cpfile_read(sb, checkpoint_size, rawi, &nilfs->ns_cpfile);
+        if (err)
                goto failed_dat;
-        nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
+        rawi = (void *)bh_sr->b_data + NILFS_SR_SUFILE_OFFSET(inode_size);
-        if (unlikely(!nilfs->ns_cpfile))
+        err = nilfs_sufile_read(sb, segment_usage_size, rawi,
-                goto failed_gc_dat;
+                                &nilfs->ns_sufile);
+        if (err)
-        nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
-        if (unlikely(!nilfs->ns_sufile))
                goto failed_cpfile;
-        nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-        err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
-                             NILFS_SR_DAT_OFFSET(inode_size));
-        if (unlikely(err))
-                goto failed_sufile;
-        err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
-                                NILFS_SR_CPFILE_OFFSET(inode_size));
-        if (unlikely(err))
-                goto failed_sufile;
-        err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
-                                NILFS_SR_SUFILE_OFFSET(inode_size));
-        if (unlikely(err))
-                goto failed_sufile;
        raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
        nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
@@ -233,17 +144,11 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
        brelse(bh_sr);
        return err;
- failed_sufile:
-        nilfs_mdt_destroy(nilfs->ns_sufile);
 failed_cpfile:
-        nilfs_mdt_destroy(nilfs->ns_cpfile);
+        iput(nilfs->ns_cpfile);
- failed_gc_dat:
-        nilfs_mdt_destroy(nilfs->ns_gc_dat);
 failed_dat:
-        nilfs_mdt_destroy(nilfs->ns_dat);
+        iput(nilfs->ns_dat);
        goto failed;
 }
@@ -306,15 +211,6 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        int valid_fs = nilfs_valid_fs(nilfs);
        int err;
-        if (nilfs_loaded(nilfs)) {
-                if (valid_fs ||
-                    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
-                        return 0;
-                printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
-                       "recovery state.\n");
-                return -EINVAL;
-        }
        if (!valid_fs) {
                printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
                if (s_flags & MS_RDONLY) {
@@ -375,7 +271,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
                        goto scan_error;
        }
-        err = nilfs_load_super_root(nilfs, ri.ri_super_root);
+        err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
        if (unlikely(err)) {
                printk(KERN_ERR "NILFS: error loading super root.\n");
                goto failed;
@@ -443,10 +339,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        goto failed;
 failed_unload:
-        nilfs_mdt_destroy(nilfs->ns_cpfile);
+        iput(nilfs->ns_cpfile);
-        nilfs_mdt_destroy(nilfs->ns_sufile);
+        iput(nilfs->ns_sufile);
-        nilfs_mdt_destroy(nilfs->ns_dat);
+        iput(nilfs->ns_dat);
-        nilfs_mdt_destroy(nilfs->ns_gc_dat);
 failed:
        nilfs_clear_recovery_info(&ri);
@@ -468,8 +363,8 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
 static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
                                   struct nilfs_super_block *sbp)
 {
-        if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
+        if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
-                printk(KERN_ERR "NILFS: revision mismatch "
+                printk(KERN_ERR "NILFS: unsupported revision "
                       "(superblock rev.=%d.%d, current rev.=%d.%d). "
                       "Please check the version of mkfs.nilfs.\n",
                       le32_to_cpu(sbp->s_rev_level),
@@ -631,12 +526,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 *
 * init_nilfs() performs common initialization per block device (e.g.
 * reading the super block, getting disk layout information, initializing
- * shared fields in the_nilfs). It takes on some portion of the jobs
+ * shared fields in the_nilfs).
- * typically done by a fill_super() routine. This division arises from
- * the nature that multiple NILFS instances may be simultaneously
- * mounted on a device.
- * For multiple mounts on the same device, only the first mount
- * invokes these tasks.
 *
 * Return Value: On success, 0 is returned. On error, a negative error
 * code is returned.
@@ -645,32 +535,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 {
        struct super_block *sb = sbi->s_super;
        struct nilfs_super_block *sbp;
-        struct backing_dev_info *bdi;
        int blocksize;
        int err;
        down_write(&nilfs->ns_sem);
-        if (nilfs_init(nilfs)) {
-                /* Load values from existing the_nilfs */
-                sbp = nilfs->ns_sbp[0];
-                err = nilfs_store_magic_and_option(sb, sbp, data);
-                if (err)
-                        goto out;
-                err = nilfs_check_feature_compatibility(sb, sbp);
-                if (err)
-                        goto out;
-                blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
-                if (sb->s_blocksize != blocksize &&
-                    !sb_set_blocksize(sb, blocksize)) {
-                        printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
-                               blocksize);
-                        err = -EINVAL;
-                }
-                sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
-                goto out;
-        }
        blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
        if (!blocksize) {
@@ -729,18 +597,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
        nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
-        bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
-        nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
        err = nilfs_store_log_cursor(nilfs, sbp);
        if (err)
                goto failed_sbh;
-        /* Initialize gcinode cache */
-        err = nilfs_init_gccache(nilfs);
-        if (err)
-                goto failed_sbh;
        set_nilfs_init(nilfs);
        err = 0;
 out:
@@ -775,9 +635,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                        ret = blkdev_issue_discard(nilfs->ns_bdev,
                                                   start * sects_per_block,
                                                   nblocks * sects_per_block,
-                                                   GFP_NOFS,
+                                                   GFP_NOFS, 0);
-                                                   BLKDEV_IFL_WAIT |
-                                                   BLKDEV_IFL_BARRIER);
                        if (ret < 0)
                                return ret;
                        nblocks = 0;
@@ -787,8 +645,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                ret = blkdev_issue_discard(nilfs->ns_bdev,
                                           start * sects_per_block,
                                           nblocks * sects_per_block,
-                                           GFP_NOFS,
+                                           GFP_NOFS, 0);
-                                          BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
        return ret;
 }
@@ -815,79 +672,92 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
        return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
 }
-/**
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
- * nilfs_find_sbinfo - find existing nilfs_sb_info structure
- * @nilfs: nilfs object
- * @rw_mount: mount type (non-zero value for read/write mount)
- * @cno: checkpoint number (zero for read-only mount)
- *
- * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
- * @rw_mount and @cno (in case of snapshots) matched.  If no instance
- * was found, NULL is returned.  Although the super block instance can
- * be unmounted after this function returns, the nilfs_sb_info struct
- * is kept on memory until nilfs_put_sbinfo() is called.
- */
-struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
-                                        int rw_mount, __u64 cno)
 {
-        struct nilfs_sb_info *sbi;
+        struct rb_node *n;
+        struct nilfs_root *root;
-        down_read(&nilfs->ns_super_sem);
-        /*
+        spin_lock(&nilfs->ns_cptree_lock);
-         * The SNAPSHOT flag and sb->s_flags are supposed to be
+        n = nilfs->ns_cptree.rb_node;
-         * protected with nilfs->ns_super_sem.
+        while (n) {
-         */
+                root = rb_entry(n, struct nilfs_root, rb_node);
-        sbi = nilfs->ns_current;
-        if (rw_mount) {
+                if (cno < root->cno) {
-                if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
+                        n = n->rb_left;
-                        goto found; /* read/write mount */
+                } else if (cno > root->cno) {
-                else
+                        n = n->rb_right;
-                        goto out;
+                } else {
-        } else if (cno == 0) {
+                        atomic_inc(&root->count);
-                if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
+                        spin_unlock(&nilfs->ns_cptree_lock);
-                        goto found; /* read-only mount */
+                        return root;
-                else
+                }
-                        goto out;
        }
+        spin_unlock(&nilfs->ns_cptree_lock);
-        list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
-                if (nilfs_test_opt(sbi, SNAPSHOT) &&
-                    sbi->s_snapshot_cno == cno)
-                        goto found; /* snapshot mount */
-        }
- out:
-        up_read(&nilfs->ns_super_sem);
        return NULL;
- found:
-        atomic_inc(&sbi->s_count);
-        up_read(&nilfs->ns_super_sem);
-        return sbi;
 }
-int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
+struct nilfs_root *
-                                int snapshot_mount)
+nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 {
-        struct nilfs_sb_info *sbi;
+        struct rb_node **p, *parent;
-        int ret = 0;
+        struct nilfs_root *root, *new;
-        down_read(&nilfs->ns_super_sem);
+        root = nilfs_lookup_root(nilfs, cno);
-        if (cno == 0 || cno > nilfs->ns_cno)
+        if (root)
-                goto out_unlock;
+                return root;
-        list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
+        new = kmalloc(sizeof(*root), GFP_KERNEL);
-                if (sbi->s_snapshot_cno == cno &&
+        if (!new)
-                    (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
+                return NULL;
-                                        /* exclude read-only mounts */
-                        ret++;
+        spin_lock(&nilfs->ns_cptree_lock);
-                        break;
+        p = &nilfs->ns_cptree.rb_node;
+        parent = NULL;
+        while (*p) {
+                parent = *p;
+                root = rb_entry(parent, struct nilfs_root, rb_node);
+                if (cno < root->cno) {
+                        p = &(*p)->rb_left;
+                } else if (cno > root->cno) {
+                        p = &(*p)->rb_right;
+                } else {
+                        atomic_inc(&root->count);
+                        spin_unlock(&nilfs->ns_cptree_lock);
+                        kfree(new);
+                        return root;
                }
        }
-        /* for protecting recent checkpoints */
-        if (cno >= nilfs_last_cno(nilfs))
-                ret++;
- out_unlock:
+        new->cno = cno;
-        up_read(&nilfs->ns_super_sem);
+        new->ifile = NULL;
-        return ret;
+        new->nilfs = nilfs;
+        atomic_set(&new->count, 1);
+        atomic_set(&new->inodes_count, 0);
+        atomic_set(&new->blocks_count, 0);
+        rb_link_node(&new->rb_node, parent, p);
+        rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
+        spin_unlock(&nilfs->ns_cptree_lock);
+        return new;
+}
+/**
+ * nilfs_put_root - drop a reference to a nilfs root object
+ * @root: root object to release
+ *
+ * Decrements @root->count.  When the count reaches zero, the root is
+ * unlinked from the checkpoint tree of its nilfs object, its ifile inode
+ * (if one is attached) is released with iput(), and the structure is freed.
+ *
+ * The final decrement is performed with ns_cptree_lock held
+ * (atomic_dec_and_lock()).  Tree lookups take their reference with
+ * atomic_inc() under the same lock, so doing the zero transition and the
+ * rb_erase() in one critical section prevents a concurrent lookup from
+ * reviving a root whose count already dropped to zero and is being freed.
+ */
+void nilfs_put_root(struct nilfs_root *root)
+{
+        struct the_nilfs *nilfs = root->nilfs;
+        /* Returns true with the lock held only when the count hit zero. */
+        if (atomic_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) {
+                rb_erase(&root->rb_node, &nilfs->ns_cptree);
+                spin_unlock(&nilfs->ns_cptree_lock);
+                /* Node is unreachable now; release resources unlocked. */
+                if (root->ifile)
+                        iput(root->ifile);
+                kfree(root);
+        }
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f785a7b0ab99..69226e14b745 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -26,6 +26,7 @@
 #include <linux/types.h>
 #include <linux/buffer_head.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -45,22 +46,13 @@ enum {
 /**
 * struct the_nilfs - struct to supervise multiple nilfs mount points
 * @ns_flags: flags
- * @ns_count: reference count
- * @ns_list: list head for nilfs_list
 * @ns_bdev: block device
- * @ns_bdi: backing dev info
- * @ns_writer: back pointer to writable nilfs_sb_info
 * @ns_sem: semaphore for shared states
- * @ns_super_sem: semaphore for global operations across super block instances
- * @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_sem: semaphore protecting ns_writer attach/detach
- * @ns_current: back pointer to current mount
 * @ns_sbh: buffer heads of on-disk super blocks
 * @ns_sbp: pointers to super block data
 * @ns_sbwtime: previous write time of super block
 * @ns_sbwcount: write count of super block
 * @ns_sbsize: size of valid data in super block
- * @ns_supers: list of nilfs super block structs
 * @ns_seg_seq: segment sequence counter
 * @ns_segnum: index number of the latest full segment.
 * @ns_nextnum: index number of the full segment index to be used next
@@ -79,9 +71,9 @@ enum {
 * @ns_dat: DAT file inode
 * @ns_cpfile: checkpoint file inode
 * @ns_sufile: segusage file inode
- * @ns_gc_dat: shadow inode of the DAT file inode for GC
+ * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
+ * @ns_cptree_lock: lock protecting @ns_cptree
 * @ns_gc_inodes: dummy inodes to keep live blocks
- * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
 * @ns_blocksize_bits: bit length of block size
 * @ns_blocksize: block size
 * @ns_nsegments: number of segments in filesystem
@@ -95,22 +87,9 @@ enum {
 */
 struct the_nilfs {
        unsigned long           ns_flags;
-        atomic_t                ns_count;
-        struct list_head        ns_list;
        struct block_device    *ns_bdev;
-        struct backing_dev_info *ns_bdi;
-        struct nilfs_sb_info   *ns_writer;
        struct rw_semaphore     ns_sem;
-        struct rw_semaphore     ns_super_sem;
-        struct mutex            ns_mount_mutex;
-        struct rw_semaphore     ns_writer_sem;
-        /*
-         * components protected by ns_super_sem
-         */
-        struct nilfs_sb_info   *ns_current;
-        struct list_head        ns_supers;
        /*
         * used for
@@ -163,11 +142,13 @@ struct the_nilfs {
        struct inode           *ns_dat;
        struct inode           *ns_cpfile;
        struct inode           *ns_sufile;
-        struct inode           *ns_gc_dat;
-        /* GC inode list and hash table head */
+        /* Checkpoint tree */
+        struct rb_root          ns_cptree;
+        spinlock_t              ns_cptree_lock;
+        /* GC inode list */
        struct list_head        ns_gc_inodes;
-        struct hlist_head      *ns_gc_inodes_h;
        /* Disk layout information (static) */
        unsigned int            ns_blocksize_bits;
@@ -182,9 +163,6 @@ struct the_nilfs {
        u32                     ns_crc_seed;
 };
-#define NILFS_GCINODE_HASH_BITS         8
-#define NILFS_GCINODE_HASH_SIZE         (1<<NILFS_GCINODE_HASH_BITS)
 #define THE_NILFS_FNS(bit, name)                                        \
 static inline void set_nilfs_##name(struct the_nilfs *nilfs)            \
 {                                                                       \
@@ -205,6 +183,32 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+/**
+ * struct nilfs_root - nilfs root object
+ * @cno: checkpoint number
+ * @rb_node: red-black tree node
+ * @count: refcount of this structure
+ * @nilfs: nilfs object
+ * @ifile: inode file
+ * @inodes_count: number of inodes
+ * @blocks_count: number of blocks (Reserved)
+ */
+struct nilfs_root {
+        __u64 cno;
+        struct rb_node rb_node;
+        atomic_t count;
+        struct the_nilfs *nilfs;
+        struct inode *ifile;
+        atomic_t inodes_count;
+        atomic_t blocks_count;
+};
+/* Special checkpoint number */
+#define NILFS_CPTREE_CURRENT_CNO        0
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ           10
@@ -221,46 +225,25 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
 }
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *find_or_create_nilfs(struct block_device *);
+struct the_nilfs *alloc_nilfs(struct block_device *bdev);
-void put_nilfs(struct the_nilfs *);
+void destroy_nilfs(struct the_nilfs *nilfs);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
+struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
+                                             __u64 cno);
+void nilfs_put_root(struct nilfs_root *root);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
-int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
 void nilfs_swap_super_block(struct the_nilfs *);
-static inline void get_nilfs(struct the_nilfs *nilfs)
+static inline void nilfs_get_root(struct nilfs_root *root)
-{
-        /* Caller must have at least one reference of the_nilfs. */
-        atomic_inc(&nilfs->ns_count);
-}
-static inline void
-nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-        down_write(&nilfs->ns_writer_sem);
-        nilfs->ns_writer = sbi;
-        up_write(&nilfs->ns_writer_sem);
-}
-static inline void
-nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
-{
-        down_write(&nilfs->ns_writer_sem);
-        if (sbi == nilfs->ns_writer)
-                nilfs->ns_writer = NULL;
-        up_write(&nilfs->ns_writer_sem);
-}
-static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
 {
-        if (atomic_dec_and_test(&sbi->s_count))
+        atomic_inc(&root->count);
-                kfree(sbi);
 }
 static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/no-block.c b/fs/no-block.c
index d269a93d3467..6e40e42a43de 100644
--- a/fs/no-block.c
+++ b/fs/no-block.c
@@ -19,4 +19,5 @@ static int no_blkdev_open(struct inode * inode, struct file * filp)
 const struct file_operations def_blk_fops = {
        .open           = no_blkdev_open,
+        .llseek         = noop_llseek,
 };
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index b388443c3a09..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
-#source "fs/notify/fanotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 85366c78cc37..b04f88eed09e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -131,6 +131,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
        BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
        BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
        BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+        BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
@@ -160,20 +161,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
                                       __u32 event_mask, void *data, int data_type)
 {
        __u32 marks_mask, marks_ignored_mask;
+        struct path *path = data;
        pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
                 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
                 inode_mark, vfsmnt_mark, event_mask, data, data_type);
-        /* sorry, fanotify only gives a damn about files and dirs */
-        if (!S_ISREG(to_tell->i_mode) &&
-            !S_ISDIR(to_tell->i_mode))
-                return false;
        /* if we don't have enough info to send an event to userspace say no */
        if (data_type != FSNOTIFY_EVENT_PATH)
                return false;
+        /* sorry, fanotify only gives a damn about files and dirs */
+        if (!S_ISREG(path->dentry->d_inode->i_mode) &&
+            !S_ISDIR(path->dentry->d_inode->i_mode))
+                return false;
        if (inode_mark && vfsmnt_mark) {
                marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
                marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
@@ -194,16 +196,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
                BUG();
        }
+        if (S_ISDIR(path->dentry->d_inode->i_mode) &&
+            (marks_ignored_mask & FS_ISDIR))
+                return false;
        if (event_mask & marks_mask & ~marks_ignored_mask)
                return true;
        return false;
 }
+/*
+ * Installed as the .free_group_priv callback of fanotify_fsnotify_ops.
+ * Undoes the per-user accounting set up by fanotify_init(): decrements the
+ * user's fanotify_listeners count and calls free_uid() on the user_struct
+ * stored in group->fanotify_data.user.
+ */
+static void fanotify_free_group_priv(struct fsnotify_group *group)
+{
+        struct user_struct *user;
+        user = group->fanotify_data.user;
+        atomic_dec(&user->fanotify_listeners);
+        free_uid(user);
+}
 const struct fsnotify_ops fanotify_fsnotify_ops = {
        .handle_event = fanotify_handle_event,
        .should_send_event = fanotify_should_send_event,
-        .free_group_priv = NULL,
+        .free_group_priv = fanotify_free_group_priv,
        .free_event_priv = NULL,
        .freeing_mark = NULL,
 };
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 5ed8e58d7bfc..063224812b7e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,10 @@
 #include <asm/ioctls.h>
+#define FANOTIFY_DEFAULT_MAX_EVENTS     16384
+#define FANOTIFY_DEFAULT_MAX_MARKS      8192
+#define FANOTIFY_DEFAULT_MAX_LISTENERS  128
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 static struct kmem_cache *fanotify_mark_cache __read_mostly;
@@ -326,7 +330,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
                ret = -EAGAIN;
                if (file->f_flags & O_NONBLOCK)
                        break;
-                ret = -EINTR;
+                ret = -ERESTARTSYS;
                if (signal_pending(current))
                        break;
@@ -372,11 +376,10 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
        struct fsnotify_group *group = file->private_data;
-        struct fanotify_response_event *re, *lre;
-        pr_debug("%s: file=%p group=%p\n", __func__, file, group);
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+        struct fanotify_response_event *re, *lre;
        mutex_lock(&group->fanotify_data.access_mutex);
        group->fanotify_data.bypass_perm = true;
@@ -433,6 +436,7 @@ static const struct file_operations fanotify_fops = {
        .release        = fanotify_release,
        .unlocked_ioctl = fanotify_ioctl,
        .compat_ioctl   = fanotify_ioctl,
+        .llseek         = noop_llseek,
 };
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
@@ -553,18 +557,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
                                       __u32 mask,
                                       unsigned int flags)
 {
-        __u32 oldmask;
+        __u32 oldmask = -1;
        spin_lock(&fsn_mark->lock);
        if (!(flags & FAN_MARK_IGNORED_MASK)) {
                oldmask = fsn_mark->mask;
                fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
        } else {
-                oldmask = fsn_mark->ignored_mask;
+                __u32 tmask = fsn_mark->ignored_mask | mask;
-                fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
+                fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
                if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
                        fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
        }
+        if (!(flags & FAN_MARK_ONDIR)) {
+                __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
+                fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+        }
        spin_unlock(&fsn_mark->lock);
        return mask & ~oldmask;
@@ -581,6 +591,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
        if (!fsn_mark) {
                int ret;
+                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+                        return -ENOSPC;
                fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
                if (!fsn_mark)
                        return -ENOMEM;
@@ -609,10 +622,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
        pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
+        /*
+         * If some other task has this inode open for write we should not add
+         * an ignored mark, unless that ignored mark is supposed to survive
+         * modification changes anyway.
+         */
+        if ((flags & FAN_MARK_IGNORED_MASK) &&
+            !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+            (atomic_read(&inode->i_writecount) > 0))
+                return 0;
        fsn_mark = fsnotify_find_inode_mark(group, inode);
        if (!fsn_mark) {
                int ret;
+                if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+                        return -ENOSPC;
                fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
                if (!fsn_mark)
                        return -ENOMEM;
@@ -636,6 +662,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 {
        struct fsnotify_group *group;
        int f_flags, fd;
+        struct user_struct *user;
        pr_debug("%s: flags=%d event_f_flags=%d\n",
                __func__, flags, event_f_flags);
@@ -646,6 +673,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        if (flags & ~FAN_ALL_INIT_FLAGS)
                return -EINVAL;
+        user = get_current_user();
+        if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
+                free_uid(user);
+                return -EMFILE;
+        }
        f_flags = O_RDWR | FMODE_NONOTIFY;
        if (flags & FAN_CLOEXEC)
                f_flags |= O_CLOEXEC;
@@ -657,12 +690,47 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
        if (IS_ERR(group))
                return PTR_ERR(group);
+        group->fanotify_data.user = user;
+        atomic_inc(&user->fanotify_listeners);
        group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
        mutex_init(&group->fanotify_data.access_mutex);
        init_waitqueue_head(&group->fanotify_data.access_waitq);
        INIT_LIST_HEAD(&group->fanotify_data.access_list);
 #endif
+        switch (flags & FAN_ALL_CLASS_BITS) {
+        case FAN_CLASS_NOTIF:
+                group->priority = FS_PRIO_0;
+                break;
+        case FAN_CLASS_CONTENT:
+                group->priority = FS_PRIO_1;
+                break;
+        case FAN_CLASS_PRE_CONTENT:
+                group->priority = FS_PRIO_2;
+                break;
+        default:
+                fd = -EINVAL;
+                goto out_put_group;
+        }
+        if (flags & FAN_UNLIMITED_QUEUE) {
+                fd = -EPERM;
+                if (!capable(CAP_SYS_ADMIN))
+                        goto out_put_group;
+                group->max_events = UINT_MAX;
+        } else {
+                group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+        }
+        if (flags & FAN_UNLIMITED_MARKS) {
+                fd = -EPERM;
+                if (!capable(CAP_SYS_ADMIN))
+                        goto out_put_group;
+                group->fanotify_data.max_marks = UINT_MAX;
+        } else {
+                group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+        }
        fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
        if (fd < 0)
@@ -703,6 +771,12 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
        default:
                return -EINVAL;
        }
+        if (mask & FAN_ONDIR) {
+                flags |= FAN_MARK_ONDIR;
+                mask &= ~FAN_ONDIR;
+        }
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
        if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
 #else
@@ -718,6 +792,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
        ret = -EINVAL;
        if (unlikely(filp->f_op != &fanotify_fops))
                goto fput_and_out;
+        group = filp->private_data;
+        /*
+         * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  Groups of this
+         * class are not allowed to set permission events.
+         */
+        ret = -EINVAL;
+        if (mask & FAN_ALL_PERM_EVENTS &&
+            group->priority == FS_PRIO_0)
+                goto fput_and_out;
        ret = fanotify_find_path(dfd, pathname, &path, flags);
        if (ret)
@@ -728,7 +812,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
                inode = path.dentry->d_inode;
        else
                mnt = path.mnt;
-        group = filp->private_data;
        /* create/update an inode mark */
        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 36802420d69a..20dc218707ca 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,59 +84,39 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
         struct dentry *parent;
         struct inode *p_inode;
-        bool send = false;
+        int ret = 0;
-        bool should_update_children = false;
         if (!dentry)
                 dentry = path->dentry;
         if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
-                return;
+                return 0;
-        spin_lock(&dentry->d_lock);
+        parent = dget_parent(dentry);
-        parent = dentry->d_parent;
         p_inode = parent->d_inode;
+        /*
+         * The child has DCACHE_FSNOTIFY_PARENT_WATCHED set.  If the parent
+         * does not actually watch its children, refresh the children's
+         * d_flags instead of sending an event; otherwise notify the parent
+         * when its mask matches.
+         */
-        if (fsnotify_inode_watches_children(p_inode)) {
+        if (unlikely(!fsnotify_inode_watches_children(p_inode)))
-                if (p_inode->i_fsnotify_mask & mask) {
+                __fsnotify_update_child_dentry_flags(p_inode);
-                        dget(parent);
+        else if (p_inode->i_fsnotify_mask & mask) {
-                        send = true;
-                }
-        } else {
-                /*
-                 * The parent doesn't care about events on it's children but
-                 * at least one child thought it did.  We need to run all the
-                 * children and update their d_flags to let them know p_inode
-                 * doesn't care about them any more.
-                 */
-                dget(parent);
-                should_update_children = true;
-        }
-        spin_unlock(&dentry->d_lock);
-        if (send) {
                 /* we are notifying a parent so come up with the new mask which
                  * specifies these are events which came from a child. */
                 mask |= FS_EVENT_ON_CHILD;
                 if (path)
-                        fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+                        ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
-                                 dentry->d_name.name, 0);
+                                       dentry->d_name.name, 0);
                 else
-                        fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+                        ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-                                 dentry->d_name.name, 0);
+                                       dentry->d_name.name, 0);
-                dput(parent);
         }
-        if (unlikely(should_update_children)) {
+        dput(parent);
-                __fsnotify_update_child_dentry_flags(p_inode);
-                dput(parent);
+        return ret;
-        }
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
@@ -275,20 +255,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
                if (inode_group > vfsmount_group) {
                        /* handle inode */
-                        send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+                        ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
-                                      data_is, cookie, file_name, &event);
+                                            data_is, cookie, file_name, &event);
                        /* we didn't use the vfsmount_mark */
                        vfsmount_group = NULL;
                } else if (vfsmount_group > inode_group) {
-                        send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
+                        ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
-                                      data_is, cookie, file_name, &event);
+                                            data_is, cookie, file_name, &event);
                        inode_group = NULL;
                } else {
-                        send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+                        ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
-                                      mask, data, data_is, cookie, file_name,
+                                            mask, data, data_is, cookie, file_name,
-                                      &event);
+                                            &event);
                }
+                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
+                        goto out;
                if (inode_group)
                        inode_node = srcu_dereference(inode_node->next,
                                                      &fsnotify_mark_srcu);
@@ -296,7 +279,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
                        vfsmount_node = srcu_dereference(vfsmount_node->next,
                                                         &fsnotify_mark_srcu);
        }
+        ret = 0;
+out:
        srcu_read_unlock(&fsnotify_mark_srcu, idx);
        /*
         * fsnotify_create_event() took a reference so the event can't be cleaned
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c005060..4c29fcf557d1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
 * Attach an initialized mark to a given inode.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group and for which inodes.  These
- * marks are ordered according to the group's location in memory.
+ * marks are ordered according to priority, highest number first, and then by
+ * the group's location in memory.
 */
 int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                            struct fsnotify_group *group, struct inode *inode,
@@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                        goto out;
                }
-                if (mark->group < lmark->group)
+                if (mark->group->priority < lmark->group->priority)
+                        continue;
+                if ((mark->group->priority == lmark->group->priority) &&
+                    (mark->group < lmark->group))
                        continue;
                hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -240,6 +245,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 {
        struct inode *inode, *next_i, *need_iput = NULL;
+        spin_lock(&inode_lock);
        list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
                struct inode *need_iput_tmp;
@@ -297,4 +303,5 @@ void fsnotify_unmount_inodes(struct list_head *list)
                spin_lock(&inode_lock);
        }
+        spin_unlock(&inode_lock);
 }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index bf7f6d776c31..444c305a468c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -344,6 +344,7 @@ static const struct file_operations inotify_fops = {
        .release        = inotify_release,
        .unlocked_ioctl = inotify_ioctl,
        .compat_ioctl   = inotify_ioctl,
+        .llseek         = noop_llseek,
 };
@@ -861,7 +862,7 @@ static int __init inotify_user_setup(void)
        BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
        BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
        BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
-        BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+        BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
        BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
        BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 56772b578fbd..85eebff6d0d7 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
                        goto out;
                }
-                if (mark->group < lmark->group)
+                if (mark->group->priority < lmark->group->priority)
+                        continue;
+                if ((mark->group->priority == lmark->group->priority) &&
+                    (mark->group < lmark->group))
                        continue;
                hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 512806171bfa..a30ecacc01f2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
-#include <linux/smp_lock.h>
 #include <linux/bitmap.h>
 #include "sysctl.h"
@@ -445,7 +444,6 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
        ntfs_debug("Entering with remount options string: %s", opt);
-        lock_kernel();
 #ifndef NTFS_RW
        /* For read-only compiled driver, enforce read-only flag. */
        *flags |= MS_RDONLY;
@@ -469,18 +467,15 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                if (NVolErrors(vol)) {
                        ntfs_error(sb, "Volume has errors and is read-only%s",
                                        es);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_IS_DIRTY) {
                        ntfs_error(sb, "Volume is dirty and read-only%s", es);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
                        ntfs_error(sb, "Volume has been modified by chkdsk "
                                        "and is read-only%s", es);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -488,13 +483,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                                        "(0x%x) and is read-only%s",
                                        (unsigned)le16_to_cpu(vol->vol_flags),
                                        es);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
                        ntfs_error(sb, "Failed to set dirty bit in volume "
                                        "information flags%s", es);
-                        unlock_kernel();
                        return -EROFS;
                }
 #if 0
@@ -514,21 +507,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                        ntfs_error(sb, "Failed to empty journal $LogFile%s",
                                        es);
                        NVolSetErrors(vol);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (!ntfs_mark_quotas_out_of_date(vol)) {
                        ntfs_error(sb, "Failed to mark quotas out of date%s",
                                        es);
                        NVolSetErrors(vol);
-                        unlock_kernel();
                        return -EROFS;
                }
                if (!ntfs_stamp_usnjrnl(vol)) {
                        ntfs_error(sb, "Failed to stamp transation log "
                                        "($UsnJrnl)%s", es);
                        NVolSetErrors(vol);
-                        unlock_kernel();
                        return -EROFS;
                }
        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -544,11 +534,9 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
        // TODO: Deal with *flags.
-        if (!parse_options(vol, opt)) {
+        if (!parse_options(vol, opt))
-                unlock_kernel();
                return -EINVAL;
-        }
-        unlock_kernel();
        ntfs_debug("Done.");
        return 0;
 }
@@ -2261,8 +2249,6 @@ static void ntfs_put_super(struct super_block *sb)
        ntfs_debug("Entering.");
-        lock_kernel();
 #ifdef NTFS_RW
        /*
         * Commit all inodes while they are still open in case some of them
@@ -2433,8 +2419,6 @@ static void ntfs_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
        kfree(vol);
-        unlock_kernel();
 }
 /**
@@ -2772,8 +2756,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
        init_rwsem(&vol->mftbmp_lock);
        init_rwsem(&vol->lcnbmp_lock);
-        unlock_kernel();
        /* By default, enable sparse support. */
        NVolSetSparseEnabled(vol);
@@ -2929,8 +2911,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                goto unl_upcase_iput_tmp_ino_err_out_now;
        }
        if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-                /* We increment i_count simulating an ntfs_iget(). */
+                /* We grab a reference, simulating an ntfs_iget(). */
-                atomic_inc(&vol->root_ino->i_count);
+                ihold(vol->root_ino);
                ntfs_debug("Exiting, status successful.");
                /* Release the default upcase if it has no users. */
                mutex_lock(&ntfs_lock);
@@ -2940,7 +2922,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                }
                mutex_unlock(&ntfs_lock);
                sb->s_export_op = &ntfs_export_ops;
-                lock_kernel();
                lockdep_on();
                return 0;
        }
@@ -3040,24 +3021,8 @@ iput_tmp_ino_err_out_now:
        if (vol->mft_ino && vol->mft_ino != tmp_ino)
                iput(vol->mft_ino);
        vol->mft_ino = NULL;
-        /*
-         * This is needed to get ntfs_clear_extent_inode() called for each
-         * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-         * leak resources and B) a subsequent mount fails automatically due to
-         * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-         * method again... FIXME: Do we need to do this twice now because of
-         * attribute inodes? I think not, so leave as is for now... (AIA)
-         */
-        if (invalidate_inodes(sb)) {
-                ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-                                "driver bug.");
-                /* Copied from fs/super.c. I just love this message. (-; */
-                printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-                                "seconds.  Have a nice day...\n");
-        }
        /* Errors at this stage are irrelevant. */
 err_out_now:
-        lock_kernel();
        sb->s_fs_info = NULL;
        kfree(vol);
        ntfs_debug("Failed, returning -EINVAL.");
@@ -3094,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
+/*
+ * Mount entry point, wired up as .mount in ntfs_fs_type.  Delegates to
+ * mount_bdev() with ntfs_fill_super as the superblock fill callback and
+ * returns whatever mount_bdev() returns.
+ */
-static int ntfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-                           mnt);
 }
 static struct file_system_type ntfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ntfs",
-        .get_sb         = ntfs_get_sb,
+        .mount          = ntfs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee118158..f1e962cb3b73 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
         * ocfs2 never allocates in this function - the only time we
         * need to use BH_New is when we're extending i_size on a file
         * system which doesn't support holes, in which case BH_New
-         * allows block_prepare_write() to zero.
+         * allows __block_write_begin() to zero.
         *
         * If we see this on a sparse file system, then a truncate has
         * raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to)
-{
-        int ret;
-        ret = block_prepare_write(page, from, to, ocfs2_get_block);
-        return ret;
-}
 /* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 }
 /*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f663da6d..76bfdfda691a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
-                               unsigned from, unsigned to);
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         struct page *page,
                                                         unsigned from,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530c..15fdbdf9eb4b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
 struct o2net_sock_container {
        struct kref             sc_kref;
-        /* the next two are vaild for the life time of the sc */
+        /* the next two are valid for the life time of the sc */
        struct socket           *sc_sock;
        struct o2nm_node        *sc_node;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b84bb7a..b2df490a19ed 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
        if (inode) {
                ip = DLMFS_I(inode);
+                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        if (!inode)
                return NULL;
+        inode->i_ino = get_next_ino();
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
@@ -612,6 +614,7 @@ static const struct file_operations dlmfs_file_operations = {
        .poll           = dlmfs_file_poll,
        .read           = dlmfs_file_read,
        .write          = dlmfs_file_write,
+        .llseek         = default_llseek,
 };
 static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -640,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = {
        .setattr        = dlmfs_file_setattr,
 };
-static int dlmfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 }
 static struct file_system_type dlmfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ocfs2_dlmfs",
-        .get_sb         = dlmfs_get_sb,
+        .mount          = dlmfs_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9e8cc4346b76..77b4c04a2809 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,8 +187,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
                 * platter
                 */
                if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-                                           NULL, BLKDEV_IFL_WAIT);
                goto bail;
        }
@@ -797,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
                block_end = block_start + (1 << inode->i_blkbits);
                /*
-                 * block_start is block-aligned.  Bump it by one to
+                 * block_start is block-aligned.  Bump it by one to force
-                 * force ocfs2_{prepare,commit}_write() to zero the
+                 * __block_write_begin and block_commit_write to zero the
                 * whole block.
                 */
-                ret = ocfs2_prepare_write_nolock(inode, page,
+                ret = __block_write_begin(page, block_start + 1, 0,
-                                                 block_start + 1,
+                                          ocfs2_get_block);
-                                                 block_start + 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21149ae..ff5744e1e36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_commit;
        }
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3bd..1efea3615589 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
        char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-        unsigned char            l_level;
+        char                     l_level;
+        char                     l_requested;
+        char                     l_blocking;
        /* Data packed - type enum ocfs2_lock_type */
        unsigned char            l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
        unsigned char            l_action;
        /* Data packed - enum type ocfs2_unlock_action */
        unsigned char            l_unlock_action;
-        unsigned char            l_requested;
-        unsigned char            l_blocking;
        unsigned int             l_pending_gen;
        spinlock_t               l_lock;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bca0688..252e7c82f929 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        p->op_this_node = -1;
-        lock_kernel();
        mutex_lock(&ocfs2_control_lock);
        file->private_data = p;
        list_add(&p->op_list, &ocfs2_control_private_list);
        mutex_unlock(&ocfs2_control_lock);
-        unlock_kernel();
        return 0;
 }
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
        .read    = ocfs2_control_read,
        .write   = ocfs2_control_write,
        .owner   = THIS_MODULE,
+        .llseek  = default_llseek,
 };
 static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a8a0ca44f88f..f02c0ef31578 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -630,8 +630,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        struct ocfs2_super *osb = OCFS2_SB(sb);
        u32 tmp;
-        lock_kernel();
        if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
            !ocfs2_check_set_options(sb, &parsed_options)) {
                ret = -EINVAL;
@@ -739,7 +737,6 @@ unlock_osb:
                                                        MS_POSIXACL : 0);
        }
 out:
-        unlock_kernel();
        return ret;
 }
@@ -1239,14 +1236,12 @@ read_super_error:
        return status;
 }
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
                        int flags,
                        const char *dev_name,
-                        void *data,
+                        void *data)
-                        struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
-                           mnt);
 }
 static void ocfs2_kill_sb(struct super_block *sb)
@@ -1270,8 +1265,7 @@ out:
 static struct file_system_type ocfs2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ocfs2",
-        .get_sb         = ocfs2_get_sb, /* is this called when we mount
+        .mount          = ocfs2_mount,
-                                        * the fs? */
        .kill_sb        = ocfs2_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
@@ -1696,13 +1690,9 @@ static void ocfs2_put_super(struct super_block *sb)
 {
        mlog_entry("(0x%p)\n", sb);
-        lock_kernel();
        ocfs2_sync_blockdev(sb);
        ocfs2_dismount_volume(sb, 0);
-        unlock_kernel();
        mlog_exit_void();
 }
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 14a22863291a..e043c4cb9a97 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -557,17 +557,16 @@ end:
        return ret;
 }
-static int omfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *omfs_mount(struct file_system_type *fs_type,
-                        int flags, const char *dev_name,
+                        int flags, const char *dev_name, void *data)
-                        void *data, struct vfsmount *m)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+        return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
 }
 static struct file_system_type omfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "omfs",
-        .get_sb = omfs_get_sb,
+        .mount = omfs_mount,
        .kill_sb = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/open.c b/fs/open.c
index d74e1983e8dc..4197b9ed023d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd)
        /* Pick up the filp from the open intent */
        filp = nd->intent.open.file;
        /* Has the filesystem initialised the file for us? */
-        if (filp->f_path.dentry == NULL)
+        if (filp->f_path.dentry == NULL) {
+                path_get(&nd->path);
                filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
                                     NULL, cred);
-        else
+        }
-                path_put(&nd->path);
        return filp;
 }
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ffcd04f0012c..911e61f348fc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -415,16 +415,16 @@ out_no_root:
        return ret;
 }
-static int openprom_get_sb(struct file_system_type *fs_type,
+static struct dentry *openprom_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
+        return mount_single(fs_type, flags, data, openprom_fill_super);
 }
 static struct file_system_type openprom_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "openpromfs",
-        .get_sb         = openprom_get_sb,
+        .mount          = openprom_mount,
        .kill_sb        = kill_anon_super,
 };
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..0a8b0ad0c7e2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
 {
        struct hd_struct *p = dev_to_part(dev);
        free_part_stats(p);
+        free_part_info(p);
        kfree(p);
 }
@@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
                   whole_disk_show, NULL);
 struct hd_struct *add_partition(struct gendisk *disk, int partno,
-                                sector_t start, sector_t len, int flags)
+                                sector_t start, sector_t len, int flags,
+                                struct partition_meta_info *info)
 {
        struct hd_struct *p;
        dev_t devt = MKDEV(0, 0);
@@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->partno = partno;
        p->policy = get_disk_ro(disk);
+        if (info) {
+                struct partition_meta_info *pinfo = alloc_part_info(disk);
+                if (!pinfo)
+                        goto out_free_stats;
+                memcpy(pinfo, info, sizeof(*info));
+                p->info = pinfo;
+        }
        dname = dev_name(ddev);
        if (isdigit(dname[strlen(dname) - 1]))
                dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        err = blk_alloc_devt(p, &devt);
        if (err)
-                goto out_free_stats;
+                goto out_free_info;
        pdev->devt = devt;
        /* delay uevent until 'holders' subdir is created */
@@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        return p;
+out_free_info:
+        free_part_info(p);
 out_free_stats:
        free_part_stats(p);
 out_free:
@@ -513,14 +525,14 @@ void register_disk(struct gendisk *disk)
        if (device_add(ddev))
                return;
-#ifndef CONFIG_SYSFS_DEPRECATED
+        if (!sysfs_deprecated) {
-        err = sysfs_create_link(block_depr, &ddev->kobj,
+                err = sysfs_create_link(block_depr, &ddev->kobj,
-                                kobject_name(&ddev->kobj));
+                                        kobject_name(&ddev->kobj));
-        if (err) {
+                if (err) {
-                device_del(ddev);
+                        device_del(ddev);
-                return;
+                        return;
+                }
        }
-#endif
        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
@@ -642,6 +654,7 @@ rescan:
        /* add partitions */
        for (p = 1; p < state->limit; p++) {
                sector_t size, from;
+                struct partition_meta_info *info = NULL;
                size = state->parts[p].size;
                if (!size)
@@ -675,8 +688,12 @@ rescan:
                                size = get_capacity(disk) - from;
                        }
                }
+                if (state->parts[p].has_info)
+                        info = &state->parts[p].info;
                part = add_partition(disk, p, from, size,
-                                     state->parts[p].flags);
+                                     state->parts[p].flags,
+                                     &state->parts[p].info);
                if (IS_ERR(part)) {
                        printk(KERN_ERR " %s: p%d could not be added: %ld\n",
                               disk->disk_name, p, -PTR_ERR(part));
@@ -737,8 +754,7 @@ void del_gendisk(struct gendisk *disk)
        kobject_put(disk->part0.holder_dir);
        kobject_put(disk->slave_dir);
        disk->driverfs_dev = NULL;
-#ifndef CONFIG_SYSFS_DEPRECATED
+        if (!sysfs_deprecated)
-        sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-#endif
        device_del(disk_to_dev(disk));
 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/genhd.h>
 /*
 * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
                sector_t from;
                sector_t size;
                int flags;
+                bool has_info;
+                struct partition_meta_info info;
        } parts[DISK_MAX_PARTS];
        int next;
        int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
 *
 ************************************************************/
 #include <linux/crc32.h>
+#include <linux/ctype.h>
 #include <linux/math64.h>
 #include <linux/slab.h>
 #include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
        gpt_entry *ptes = NULL;
        u32 i;
        unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+        u8 unparsed_guid[37];
        if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
                kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
        pr_debug("GUID Partition Table is valid!  Yea!\n");
        for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+                struct partition_meta_info *info;
+                unsigned label_count = 0;
+                unsigned label_max;
                u64 start = le64_to_cpu(ptes[i].starting_lba);
                u64 size = le64_to_cpu(ptes[i].ending_lba) -
                           le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
                if (!efi_guidcmp(ptes[i].partition_type_guid,
                                 PARTITION_LINUX_RAID_GUID))
                        state->parts[i + 1].flags = ADDPART_FLAG_RAID;
+                info = &state->parts[i + 1].info;
+                /* Instead of doing a manual swap to big endian, reuse the
+                 * common ASCII hex format as the interim.
+                 */
+                efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
+                part_pack_uuid(unparsed_guid, info->uuid);
+                /* Naively convert UTF16-LE to 7 bits. */
+                label_max = min(sizeof(info->volname) - 1,
+                                sizeof(ptes[i].partition_name));
+                info->volname[label_max] = 0;
+                while (label_count < label_max) {
+                        u8 c = ptes[i].partition_name[label_count] & 0xff;
+                        if (c && !isprint(c))
+                                c = '!';
+                        info->volname[label_count] = c;
+                        label_count++;
+                }
+                state->parts[i + 1].has_info = true;
        }
        kfree(ptes);
        kfree(gpt);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 5bf8a04b5d9b..789c625c7aa5 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -5,7 +5,7 @@
 * Copyright (c) 2001-2007 Anton Altaparmakov
 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
 *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d1fb50b28d86..374242c0971a 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -5,7 +5,7 @@
 * Copyright (c) 2001-2007 Anton Altaparmakov
 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
 *
- * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
diff --git a/fs/pipe.c b/fs/pipe.c
index 279eef96c51c..a8012a955720 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
                        error = ops->confirm(pipe, buf);
                        if (error) {
                                if (!ret)
-                                        error = ret;
+                                        ret = error;
                                break;
                        }
@@ -954,6 +954,8 @@ static struct inode * get_pipe_inode(void)
        if (!inode)
                goto fail_inode;
+        inode->i_ino = get_next_ino();
        pipe = alloc_pipe_info(inode);
        if (!pipe)
                goto fail_iput;
@@ -1245,16 +1247,15 @@ out:
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
-static int pipefs_get_sb(struct file_system_type *fs_type,
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
-                         int flags, const char *dev_name, void *data,
+                         int flags, const char *dev_name, void *data)
-                         struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
+        return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
 }
 static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
-        .get_sb         = pipefs_get_sb,
+        .mount          = pipefs_mount,
        .kill_sb        = kill_anon_super,
 };
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f06..6a0068841d96 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -33,8 +33,8 @@ config PROC_KCORE
        depends on PROC_FS && MMU
 config PROC_VMCORE
-        bool "/proc/vmcore support (EXPERIMENTAL)"
+        bool "/proc/vmcore support"
-        depends on PROC_FS && CRASH_DUMP
+        depends on PROC_FS && CRASH_DUMP
        default y
        help
        Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e4addaa5424..f3d02ca461ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 {
        struct mm_struct *mm;
-        if (mutex_lock_killable(&task->cred_guard_mutex))
+        if (mutex_lock_killable(&task->signal->cred_guard_mutex))
                return NULL;
        mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
                mmput(mm);
                mm = NULL;
        }
-        mutex_unlock(&task->cred_guard_mutex);
+        mutex_unlock(&task->signal->cred_guard_mutex);
        return mm;
 }
@@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = {
 static int mem_open(struct inode* inode, struct file* file)
 {
        file->private_data = (void*)((long)current->self_exec_id);
+        /* OK to pass negative loff_t, we can catch out-of-range */
+        file->f_mode |= FMODE_UNSIGNED_OFFSET;
        return 0;
 }
@@ -1023,28 +1025,47 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
-        if (copy_from_user(buffer, buf, count))
+        if (copy_from_user(buffer, buf, count)) {
-                return -EFAULT;
+                err = -EFAULT;
+                goto out;
+        }
        err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
        if (err)
-                return -EINVAL;
+                goto out;
        if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-             oom_adjust != OOM_DISABLE)
+             oom_adjust != OOM_DISABLE) {
-                return -EINVAL;
+                err = -EINVAL;
+                goto out;
+        }
        task = get_proc_task(file->f_path.dentry->d_inode);
-        if (!task)
+        if (!task) {
-                return -ESRCH;
+                err = -ESRCH;
+                goto out;
+        }
+        task_lock(task);
+        if (!task->mm) {
+                err = -EINVAL;
+                goto err_task_lock;
+        }
        if (!lock_task_sighand(task, &flags)) {
-                put_task_struct(task);
+                err = -ESRCH;
-                return -ESRCH;
+                goto err_task_lock;
        }
        if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-                unlock_task_sighand(task, &flags);
+                err = -EACCES;
-                put_task_struct(task);
+                goto err_sighand;
-                return -EACCES;
+        }
+        if (oom_adjust != task->signal->oom_adj) {
+                if (oom_adjust == OOM_DISABLE)
+                        atomic_inc(&task->mm->oom_disable_count);
+                if (task->signal->oom_adj == OOM_DISABLE)
+                        atomic_dec(&task->mm->oom_disable_count);
        }
        /*
@@ -1065,10 +1086,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+err_sighand:
        unlock_task_sighand(task, &flags);
+err_task_lock:
+        task_unlock(task);
        put_task_struct(task);
+out:
-        return count;
+        return err < 0 ? err : count;
 }
 static const struct file_operations proc_oom_adjust_operations = {
@@ -1109,30 +1133,49 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
-        if (copy_from_user(buffer, buf, count))
+        if (copy_from_user(buffer, buf, count)) {
-                return -EFAULT;
+                err = -EFAULT;
+                goto out;
+        }
        err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
        if (err)
-                return -EINVAL;
+                goto out;
        if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
-                        oom_score_adj > OOM_SCORE_ADJ_MAX)
+                        oom_score_adj > OOM_SCORE_ADJ_MAX) {
-                return -EINVAL;
+                err = -EINVAL;
+                goto out;
+        }
        task = get_proc_task(file->f_path.dentry->d_inode);
-        if (!task)
+        if (!task) {
-                return -ESRCH;
+                err = -ESRCH;
+                goto out;
+        }
+        task_lock(task);
+        if (!task->mm) {
+                err = -EINVAL;
+                goto err_task_lock;
+        }
        if (!lock_task_sighand(task, &flags)) {
-                put_task_struct(task);
+                err = -ESRCH;
-                return -ESRCH;
+                goto err_task_lock;
        }
        if (oom_score_adj < task->signal->oom_score_adj &&
                        !capable(CAP_SYS_RESOURCE)) {
-                unlock_task_sighand(task, &flags);
+                err = -EACCES;
-                put_task_struct(task);
+                goto err_sighand;
-                return -EACCES;
        }
+        if (oom_score_adj != task->signal->oom_score_adj) {
+                if (oom_score_adj == OOM_SCORE_ADJ_MIN)
+                        atomic_inc(&task->mm->oom_disable_count);
+                if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                        atomic_dec(&task->mm->oom_disable_count);
+        }
        task->signal->oom_score_adj = oom_score_adj;
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
@@ -1143,14 +1186,19 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
                                                        OOM_SCORE_ADJ_MAX;
+err_sighand:
        unlock_task_sighand(task, &flags);
+err_task_lock:
+        task_unlock(task);
        put_task_struct(task);
-        return count;
+out:
+        return err < 0 ? err : count;
 }
 static const struct file_operations proc_oom_score_adj_operations = {
        .read           = oom_score_adj_read,
        .write          = oom_score_adj_write,
+        .llseek         = default_llseek,
 };
 #ifdef CONFIG_AUDITSYSCALL
@@ -1600,6 +1648,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
        /* Common stuff */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_op = &proc_def_inode_operations;
@@ -2039,11 +2088,13 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 static const struct file_operations proc_fdinfo_file_operations = {
        .open           = nonseekable_open,
        .read           = proc_fdinfo_read,
+        .llseek         = no_llseek,
 };
 static const struct file_operations proc_fd_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_readfd,
+        .llseek         = default_llseek,
 };
 /*
@@ -2112,6 +2163,7 @@ static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
 static const struct file_operations proc_fdinfo_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_readfdinfo,
+        .llseek         = default_llseek,
 };
 /*
@@ -2302,14 +2354,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                goto out_free;
        /* Guard against adverse ptrace interaction */
-        length = mutex_lock_interruptible(&task->cred_guard_mutex);
+        length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
        if (length < 0)
                goto out_free;
        length = security_setprocattr(task,
                                      (char*)file->f_path.dentry->d_name.name,
                                      (void*)page, count);
-        mutex_unlock(&task->cred_guard_mutex);
+        mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
        free_page((unsigned long) page);
 out:
@@ -2343,6 +2395,7 @@ static int proc_attr_dir_readdir(struct file * filp,
 static const struct file_operations proc_attr_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_attr_dir_readdir,
+        .llseek         = default_llseek,
 };
 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@ -2542,6 +2595,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        /* Initialize the inode */
        ei = PROC_I(inode);
+        inode->i_ino = get_next_ino();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        /*
@@ -2751,6 +2805,7 @@ static int proc_tgid_base_readdir(struct file * filp,
 static const struct file_operations proc_tgid_base_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_tgid_base_readdir,
+        .llseek         = default_llseek,
 };
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -3088,6 +3143,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
 static const struct file_operations proc_tid_base_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_tid_base_readdir,
+        .llseek         = default_llseek,
 };
 static const struct inode_operations proc_tid_base_inode_operations = {
@@ -3324,4 +3380,5 @@ static const struct inode_operations proc_task_inode_operations = {
 static const struct file_operations proc_task_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_task_readdir,
+        .llseek         = default_llseek,
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5be436ea088e..b652cb00906b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        if (!inode)
                goto out;
+        inode->i_ino = get_next_ino();
        sysctl_head_get(head);
        ei = PROC_I(inode);
        ei->sysctl = head;
@@ -364,6 +366,7 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 static const struct file_operations proc_sys_file_operations = {
        .read           = proc_sys_read,
        .write          = proc_sys_write,
+        .llseek         = default_llseek,
 };
 static const struct file_operations proc_sys_dir_file_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384ed22d..ef9fa8e24ad6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data)
        return set_anon_super(sb, NULL);
 }
-static int proc_get_sb(struct file_system_type *fs_type,
+static struct dentry *proc_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        int err;
        struct super_block *sb;
@@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type,
        sb = sget(fs_type, proc_test_super, proc_set_super, ns);
        if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
        if (!sb->s_root) {
                sb->s_flags = flags;
                err = proc_fill_super(sb);
                if (err) {
                        deactivate_locked_super(sb);
-                        return err;
+                        return ERR_PTR(err);
                }
                ei = PROC_I(sb->s_root->d_inode);
@@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
                }
                sb->s_flags |= MS_ACTIVE;
-                ns->proc_mnt = mnt;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 }
 static void proc_kill_sb(struct super_block *sb)
@@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb)
 static struct file_system_type proc_fs_type = {
        .name           = "proc",
-        .get_sb         = proc_get_sb,
+        .mount          = proc_mount,
        .kill_sb        = proc_kill_sb,
 };
@@ -115,6 +113,7 @@ void __init proc_root_init(void)
                return;
        }
+        init_pid_ns.proc_mnt = proc_mnt;
        proc_symlink("mounts", NULL, "self/mounts");
        proc_net_init();
@@ -179,6 +178,7 @@ static int proc_root_readdir(struct file * filp,
 static const struct file_operations proc_root_operations = {
        .read            = generic_read_dir,
        .readdir         = proc_root_readdir,
+        .llseek         = default_llseek,
 };
 /*
@@ -212,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
+        ns->proc_mnt = mnt;
        return 0;
 }
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f17..37994737c983 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,13 +10,13 @@ static int show_softirqs(struct seq_file *p, void *v)
 {
        int i, j;
-        seq_printf(p, "                ");
+        seq_printf(p, "                    ");
        for_each_possible_cpu(i)
                seq_printf(p, "CPU%-8d", i);
        seq_printf(p, "\n");
        for (i = 0; i < NR_SOFTIRQS; i++) {
-                seq_printf(p, "%8s:", softirq_to_name[i]);
+                seq_printf(p, "%12s:", softirq_to_name[i]);
                for_each_possible_cpu(j)
                        seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
                seq_printf(p, "\n");
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bf31b03fc275..e15a19c93bae 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -31,7 +31,6 @@ static int show_stat(struct seq_file *p, void *v)
        u64 sum_softirq = 0;
        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
        struct timespec boottime;
-        unsigned int per_irq_sum;
        user = nice = system = idle = iowait =
                irq = softirq = steal = cputime64_zero;
@@ -52,9 +51,7 @@ static int show_stat(struct seq_file *p, void *v)
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
                guest_nice = cputime64_add(guest_nice,
                        kstat_cpu(i).cpustat.guest_nice);
-                for_each_irq_nr(j) {
+                sum += kstat_cpu_irqs_sum(i);
-                        sum += kstat_irqs_cpu(j, i);
-                }
                sum += arch_irq_stat_cpu(i);
                for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -110,13 +107,8 @@ static int show_stat(struct seq_file *p, void *v)
        seq_printf(p, "intr %llu", (unsigned long long)sum);
        /* sum again ? it could be updated? */
-        for_each_irq_nr(j) {
+        for_each_irq_nr(j)
-                per_irq_sum = 0;
+                seq_printf(p, " %u", kstat_irqs(j));
-                for_each_possible_cpu(i)
-                        per_irq_sum += kstat_irqs_cpu(j, i);
-                seq_printf(p, " %u", per_irq_sum);
-        }
        seq_printf(p,
                "\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1dbca4e8cc16..da6b01d70f01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,6 +327,7 @@ struct mem_size_stats {
        unsigned long private_clean;
        unsigned long private_dirty;
        unsigned long referenced;
+        unsigned long anonymous;
        unsigned long swap;
        u64 pss;
 };
@@ -357,6 +358,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                if (!page)
                        continue;
+                if (PageAnon(page))
+                        mss->anonymous += PAGE_SIZE;
                mss->resident += PAGE_SIZE;
                /* Accumulate the size in pages that have been accessed. */
                if (pte_young(ptent) || PageReferenced(page))
@@ -410,6 +414,7 @@ static int show_smap(struct seq_file *m, void *v)
                   "Private_Clean:  %8lu kB\n"
                   "Private_Dirty:  %8lu kB\n"
                   "Referenced:     %8lu kB\n"
+                   "Anonymous:      %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
                   "MMUPageSize:    %8lu kB\n",
@@ -421,6 +426,7 @@ static int show_smap(struct seq_file *m, void *v)
                   mss.private_clean >> 10,
                   mss.private_dirty >> 10,
                   mss.referenced >> 10,
+                   mss.anonymous >> 10,
                   mss.swap >> 10,
                   vma_kernel_pagesize(vma) >> 10,
                   vma_mmu_pagesize(vma) >> 10);
@@ -539,6 +545,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 const struct file_operations proc_clear_refs_operations = {
        .write          = clear_refs_write,
+        .llseek         = noop_llseek,
 };
 struct pagemapread {
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6e8fc62b40a8..7b0329468a5d 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
 */
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
@@ -29,8 +28,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
        QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
        QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
-        lock_kernel();
        while (filp->f_pos < inode->i_size) {
                blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
                bh = sb_bread(inode->i_sb, blknum);
@@ -71,7 +68,6 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
                brelse(bh);
        }
 out:
-        unlock_kernel();
        return 0;
 }
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 16829722be93..fcada42f1aa3 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/highuid.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
@@ -157,8 +156,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct super_block *sb = dentry->d_sb;
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
-        lock_kernel();
        buf->f_type    = sb->s_magic;
        buf->f_bsize   = sb->s_blocksize;
        buf->f_blocks  = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8;
@@ -168,8 +165,6 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_fsid.val[0] = (u32)id;
        buf->f_fsid.val[1] = (u32)(id >> 32);
-        unlock_kernel();
        return 0;
 }
@@ -283,7 +278,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
                goto outi;
        brelse(bh);
        return 0;
      outi:
@@ -460,17 +454,16 @@ static void destroy_inodecache(void)
        kmem_cache_destroy(qnx4_inode_cachep);
 }
-static int qnx4_get_sb(struct file_system_type *fs_type,
+static struct dentry *qnx4_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-                           mnt);
 }
 static struct file_system_type qnx4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "qnx4",
-        .get_sb         = qnx4_get_sb,
+        .mount          = qnx4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 58703ebba879..275327b5615e 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
 */
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "qnx4.h"
@@ -109,7 +108,6 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
        int len = dentry->d_name.len;
        struct inode *foundinode = NULL;
-        lock_kernel();
        if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino)))
                goto out;
        /* The entry is linked, let's get the real info */
@@ -123,13 +121,11 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
        foundinode = qnx4_iget(dir->i_sb, ino);
        if (IS_ERR(foundinode)) {
-                unlock_kernel();
                QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
                           PTR_ERR(foundinode)));
                return ERR_CAST(foundinode);
        }
 out:
-        unlock_kernel();
        d_add(dentry, foundinode);
        return NULL;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 3e21b1e2ad3a..880fd9884366 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -4,6 +4,7 @@
 config QUOTA
        bool "Quota support"
+        select QUOTACTL
        help
          If you say Y here, you will be able to set per user limits for disk
          usage (also called disk quotas). Currently, it works for the
@@ -65,8 +66,7 @@ config QFMT_V2
 config QUOTACTL
        bool
-        depends on XFS_QUOTA || QUOTA
+        default n
-        default y
 config QUOTACTL_COMPAT
        bool
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index aad1316a977f..0fed41e6efcd 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1386,6 +1386,9 @@ static void __dquot_initialize(struct inode *inode, int type)
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
+                /* We could race with quotaon or dqget() could have failed */
+                if (!got[cnt])
+                        continue;
                if (!inode->i_dquot[cnt]) {
                        inode->i_dquot[cnt] = got[cnt];
                        got[cnt] = NULL;
@@ -1736,6 +1739,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        qsize_t rsv_space = 0;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, ret = 0;
+        char is_valid[MAXQUOTAS] = {};
        char warntype_to[MAXQUOTAS];
        char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1757,8 +1761,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        space = cur_space + rsv_space;
        /* Build the transfer_from list and check the limits */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+                /*
+                 * Skip changes for same uid or gid or for turned off quota-type.
+                 */
                if (!transfer_to[cnt])
                        continue;
+                /* Avoid races with quotaoff() */
+                if (!sb_has_quota_active(inode->i_sb, cnt))
+                        continue;
+                is_valid[cnt] = 1;
                transfer_from[cnt] = inode->i_dquot[cnt];
                ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
                if (ret)
@@ -1772,12 +1783,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
         * Finally perform the needed transfer from transfer_from to transfer_to
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                /*
+                if (!is_valid[cnt])
-                 * Skip changes for same uid or gid or for turned off quota-type.
-                 */
-                if (!transfer_to[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        warntype_from_inodes[cnt] =
@@ -1801,18 +1808,19 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
        mark_all_dquot_dirty(transfer_from);
        mark_all_dquot_dirty(transfer_to);
-        /* Pass back references to put */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                transfer_to[cnt] = transfer_from[cnt];
-warn:
        flush_warnings(transfer_to, warntype_to);
        flush_warnings(transfer_from, warntype_from_inodes);
        flush_warnings(transfer_from, warntype_from_space);
-        return ret;
+        /* Pass back references to put */
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                if (is_valid[cnt])
+                        transfer_to[cnt] = transfer_from[cnt];
+        return 0;
 over_quota:
        spin_unlock(&dq_data_lock);
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        goto warn;
+        flush_warnings(transfer_to, warntype_to);
+        return ret;
 }
 EXPORT_SYMBOL(__dquot_transfer);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae70dc6d..eacb166fb259 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
        struct inode * inode = new_inode(sb);
        if (inode) {
+                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                inode->i_mapping->a_ops = &ramfs_aops;
                inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -254,17 +255,16 @@ fail:
        return err;
 }
-int ramfs_get_sb(struct file_system_type *fs_type,
+struct dentry *ramfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
+        return mount_nodev(fs_type, flags, data, ramfs_fill_super);
 }
-static int rootfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *rootfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+        return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
-                            mnt);
 }
 static void ramfs_kill_sb(struct super_block *sb)
@@ -275,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb)
 static struct file_system_type ramfs_fs_type = {
        .name           = "ramfs",
-        .get_sb         = ramfs_get_sb,
+        .mount          = ramfs_mount,
        .kill_sb        = ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
        .name           = "rootfs",
-        .get_sb         = rootfs_get_sb,
+        .mount          = rootfs_mount,
        .kill_sb        = kill_litter_super,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index 74e36586e4d3..431a0ed610c8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+        /*
+         * pos or pos+count is negative here, check overflow.
+         * too big "count" will be caught in rw_verify_area().
+         *
+         * Return values:
+         *   -EOVERFLOW  pos is negative and pos + count compares below pos
+         *   0           f_mode has FMODE_UNSIGNED_OFFSET set, so the
+         *               negative-looking offset is accepted
+         *   -EINVAL     every other case
+         *
+         * NOTE(review): pos is loff_t and count is size_t, so the
+         * "pos + count < pos" comparison depends on the integer
+         * conversion rules for mixed signed/unsigned operands; confirm
+         * it detects exactly the wrap-around that is intended.
+         */
+        if ((pos < 0) && (pos + count < pos))
+                return -EOVERFLOW;
+        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+                return 0;
+        return -EINVAL;
+}
 /**
 * generic_file_llseek_unlocked - lockless generic llseek implementation
 * @file:       file structure to seek on
@@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
                break;
        }
-        if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+        if (offset < 0 && __negative_fpos_check(file, offset, 0))
+                return -EINVAL;
+        if (offset > inode->i_sb->s_maxbytes)
                return -EINVAL;
        /* Special lock needed here? */
@@ -124,7 +140,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
        loff_t retval;
-        lock_kernel();
+        mutex_lock(&file->f_dentry->d_inode->i_mutex);
        switch (origin) {
                case SEEK_END:
                        offset += i_size_read(file->f_path.dentry->d_inode);
@@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                        offset += file->f_pos;
        }
        retval = -EINVAL;
-        if (offset >= 0) {
+        if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
@@ -145,7 +161,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                retval = offset;
        }
 out:
-        unlock_kernel();
+        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
        return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -156,7 +172,6 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
        fn = no_llseek;
        if (file->f_mode & FMODE_LSEEK) {
-                fn = default_llseek;
                if (file->f_op && file->f_op->llseek)
                        fn = file->f_op->llseek;
        }
@@ -222,13 +237,12 @@ bad:
 }
 #endif
 /*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 */
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 {
        struct inode *inode;
@@ -239,8 +253,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        if (unlikely((ssize_t) count < 0))
                return retval;
        pos = *ppos;
-        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
+        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
-                return retval;
+                retval = __negative_fpos_check(file, pos, count);
+                if (retval)
+                        return retval;
+        }
        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
                retval = locks_mandatory_area(
@@ -565,65 +582,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
                              unsigned long nr_segs, unsigned long fast_segs,
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer)
-  {
+{
        unsigned long seg;
-        ssize_t ret;
+        ssize_t ret;
        struct iovec *iov = fast_pointer;
-        /*
+        /*
-         * SuS says "The readv() function *may* fail if the iovcnt argument
+         * SuS says "The readv() function *may* fail if the iovcnt argument
-         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-         * traditionally returned zero for zero segments, so...
+         * traditionally returned zero for zero segments, so...
-         */
+         */
        if (nr_segs == 0) {
                ret = 0;
-                goto out;
+                goto out;
        }
-        /*
+        /*
-         * First get the "struct iovec" from user memory and
+         * First get the "struct iovec" from user memory and
-         * verify all the pointers
+         * verify all the pointers
-         */
+         */
        if (nr_segs > UIO_MAXIOV) {
                ret = -EINVAL;
-                goto out;
+                goto out;
        }
        if (nr_segs > fast_segs) {
-                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
                if (iov == NULL) {
                        ret = -ENOMEM;
-                        goto out;
+                        goto out;
                }
-        }
+        }
        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
                ret = -EFAULT;
-                goto out;
+                goto out;
        }
-        /*
+        /*
         * According to the Single Unix Specification we should return EINVAL
         * if an element length is < 0 when cast to ssize_t or if the
         * total length would overflow the ssize_t return value of the
         * system call.
-         */
+         *
+         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
+         * overflow case.
+         */
        ret = 0;
-        for (seg = 0; seg < nr_segs; seg++) {
+        for (seg = 0; seg < nr_segs; seg++) {
-                void __user *buf = iov[seg].iov_base;
+                void __user *buf = iov[seg].iov_base;
-                ssize_t len = (ssize_t)iov[seg].iov_len;
+                ssize_t len = (ssize_t)iov[seg].iov_len;
                /* see if we we're about to use an invalid len or if
                 * it's about to overflow ssize_t */
-                if (len < 0 || (ret + len < ret)) {
+                if (len < 0) {
                        ret = -EINVAL;
-                        goto out;
+                        goto out;
                }
                if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
                        ret = -EFAULT;
-                        goto out;
+                        goto out;
+                }
+                if (len > MAX_RW_COUNT - ret) {
+                        len = MAX_RW_COUNT - ret;
+                        iov[seg].iov_len = len;
                }
                ret += len;
-        }
+        }
 out:
        *ret_pointer = iov;
        return ret;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 513f431038f9..7cd46666ba2c 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -10,7 +10,8 @@ config REISERFS_FS
          In general, ReiserFS is as fast as ext2, but is very efficient with
          large directories and small files.  Additional patches are needed
-          for NFS and quotas, please see <http://www.namesys.com/> for links.
+          for NFS and quotas; please see
+          <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
          It is more easily extended to have features currently found in
          database and keyword search systems than block allocation based file
@@ -18,7 +19,8 @@ config REISERFS_FS
          plugins consistent with our motto ``It takes more than a license to
          make source code open.''
-          Read <http://www.namesys.com/> to learn more about reiserfs.
+          Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
+          to learn more about reiserfs.
          Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 14e8c9d460e5..e2f7a264e3ff 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -43,7 +43,7 @@ to address the fair crediting issue in the next GPL version.)
 [END LICENSING]
 Reiserfs is a file system based on balanced tree algorithms, which is
-described at http://devlinux.com/namesys.
+described at <https://reiser4.wiki.kernel.org/index.php/Main_Page>.
 Stop reading here.  Go there, then return.
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
        barrier_done = reiserfs_commit_for_inode(inode);
        reiserfs_write_unlock(inode->i_sb);
        if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
+                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-                        BLKDEV_IFL_WAIT);
        if (barrier_done < 0)
                return barrier_done;
        return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index caa758377d66..41656d40dc5c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 void reiserfs_evict_inode(struct inode *inode)
 {
@@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
 ** but tail is still sitting in a direct item, and we can't write to
 ** it.  So, look through this page, and check all the mapped buffers
 ** to make sure they have valid block numbers.  Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
 static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 }
 /* special version of get_block that is only used by grab_tail_page right
-** now.  It is sent to block_prepare_write, and when you try to get a
+** now.  It is sent to __block_write_begin, and when you try to get a
 ** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer.  block_prepare_write expects to
+** -ENOENT instead of a valid buffer.  __block_write_begin expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
@@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode,
         **
         ** We must fix the tail page for writing because it might have buffers
         ** that are mapped, but have a block number of 0.  This indicates tail
-         ** data that has been read directly into the page, and block_prepare_write
+         ** data that has been read directly into the page, and
-         ** won't trigger a get_block in this case.
+         ** __block_write_begin won't trigger a get_block in this case.
         */
        fix_tail_page_for_writing(tail_page);
-        retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+        retval = __reiserfs_write_begin(tail_page, tail_start,
+                                      tail_end - tail_start);
        if (retval)
                goto unlock;
@@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode,
        /* start within the page of the last block in the file */
        start = (offset / blocksize) * blocksize;
-        error = block_prepare_write(page, start, offset,
+        error = __block_write_begin(page, start, offset - start,
                                    reiserfs_get_block_create_0);
        if (error)
                goto unlock;
@@ -2438,7 +2437,7 @@ static int reiserfs_write_full_page(struct page *page,
                /* from this point on, we know the buffer is mapped to a
                 * real block and not a direct item
                 */
-                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else {
                        if (!trylock_buffer(bh)) {
@@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file,
        return ret;
 }
-int reiserfs_prepare_write(struct file *f, struct page *page,
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-                           unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        int ret;
@@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
                th->t_refcount++;
        }
-        ret = block_prepare_write(page, from, to, reiserfs_get_block);
+        ret = __block_write_begin(page, from, len, reiserfs_get_block);
        if (ret && reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th = current->journal_info;
                /* this gets a little ugly.  If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e134ac..adf22b485cea 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        }
        /* we unpack by finding the page with the tail, and calling
-         ** reiserfs_prepare_write on that page.  This will force a
+         ** __reiserfs_write_begin on that page.  This will force a
         ** reiserfs_get_block to unpack the tail for us.
         */
        index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        if (!page) {
                goto out;
        }
-        retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+        retval = __reiserfs_write_begin(page, write_from, 0);
        if (retval)
                goto out_unlock;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..076c8b194682 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
        return 0;
 }
-static void disable_barrier(struct super_block *s)
-{
-        REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-        printk("reiserfs: disabling flush barriers on %s\n",
-               reiserfs_bdevname(s));
-}
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
                                                         *sb)
 {
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
        submit_bh(WRITE, bh);
 }
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-        get_bh(bh);
-        bh->b_end_io = reiserfs_end_ordered_io;
-        clear_buffer_dirty(bh);
-        if (!buffer_uptodate(bh))
-                BUG();
-        return submit_bh(WRITE_BARRIER, bh);
-}
-static void check_barrier_completion(struct super_block *s,
-                                     struct buffer_head *bh)
-{
-        if (buffer_eopnotsupp(bh)) {
-                clear_buffer_eopnotsupp(bh);
-                disable_barrier(s);
-                set_buffer_uptodate(bh);
-                set_buffer_dirty(bh);
-                reiserfs_write_unlock(s);
-                sync_dirty_buffer(bh);
-                reiserfs_write_lock(s);
-        }
-}
 #define CHUNK_SIZE 32
 struct buffer_chunk {
        struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
        struct buffer_head *tbh = NULL;
        unsigned int trans_id = jl->j_trans_id;
        struct reiserfs_journal *journal = SB_JOURNAL(s);
-        int barrier = 0;
        int retval = 0;
        int write_len;
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
        }
        atomic_dec(&journal->j_async_throttle);
-        /* We're skipping the commit if there's an error */
-        if (retval || reiserfs_is_journal_aborted(journal))
-                barrier = 0;
-        /* wait on everything written so far before writing the commit
-         * if we are in barrier mode, send the commit down now
-         */
-        barrier = reiserfs_barrier_flush(s);
-        if (barrier) {
-                int ret;
-                lock_buffer(jl->j_commit_bh);
-                ret = submit_barrier_buffer(jl->j_commit_bh);
-                if (ret == -EOPNOTSUPP) {
-                        set_buffer_uptodate(jl->j_commit_bh);
-                        disable_barrier(s);
-                        barrier = 0;
-                }
-        }
        for (i = 0; i < (jl->j_len + 1); i++) {
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
        BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
-        if (!barrier) {
+        /* If there was a write error in the journal - we can't commit
-                /* If there was a write error in the journal - we can't commit
+         * this transaction - it will be invalid and, if successful,
-                 * this transaction - it will be invalid and, if successful,
+         * will just end up propagating the write error out to
-                 * will just end up propagating the write error out to
+         * the file system. */
-                 * the file system. */
+        if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-                if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+                if (buffer_dirty(jl->j_commit_bh))
-                        if (buffer_dirty(jl->j_commit_bh))
+                        BUG();
-                                BUG();
+                mark_buffer_dirty(jl->j_commit_bh) ;
-                        mark_buffer_dirty(jl->j_commit_bh) ;
-                        reiserfs_write_unlock(s);
-                        sync_dirty_buffer(jl->j_commit_bh) ;
-                        reiserfs_write_lock(s);
-                }
-        } else {
                reiserfs_write_unlock(s);
-                wait_on_buffer(jl->j_commit_bh);
+                if (reiserfs_barrier_flush(s))
+                        __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+                else
+                        sync_dirty_buffer(jl->j_commit_bh);
                reiserfs_write_lock(s);
        }
-        check_barrier_completion(s, jl->j_commit_bh);
        /* If there was a write error in the journal - we can't commit this
         * transaction - it will be invalid and, if successful, will just end
         * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
                jh->j_first_unflushed_offset = cpu_to_le32(offset);
                jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
-                if (reiserfs_barrier_flush(sb)) {
+                set_buffer_dirty(journal->j_header_bh);
-                        int ret;
+                reiserfs_write_unlock(sb);
-                        lock_buffer(journal->j_header_bh);
-                        ret = submit_barrier_buffer(journal->j_header_bh);
+                if (reiserfs_barrier_flush(sb))
-                        if (ret == -EOPNOTSUPP) {
+                        __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
-                                set_buffer_uptodate(journal->j_header_bh);
+                else
-                                disable_barrier(sb);
-                                goto sync;
-                        }
-                        reiserfs_write_unlock(sb);
-                        wait_on_buffer(journal->j_header_bh);
-                        reiserfs_write_lock(sb);
-                        check_barrier_completion(sb, journal->j_header_bh);
-                } else {
-                      sync:
-                        set_buffer_dirty(journal->j_header_bh);
-                        reiserfs_write_unlock(sb);
                        sync_dirty_buffer(journal->j_header_bh);
-                        reiserfs_write_lock(sb);
-                }
+                reiserfs_write_lock(sb);
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
                                         "IO error during journal replay");
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a0086a..ba5f51ec3458 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        reiserfs_update_sd(&th, inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
        reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index e15ff612002d..3bf7a6457f4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2213,12 +2213,11 @@ out:
 #endif
-static int get_super_block(struct file_system_type *fs_type,
+static struct dentry *get_super_block(struct file_system_type *fs_type,
                           int flags, const char *dev_name,
-                           void *data, struct vfsmount *mnt)
+                           void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
-                           mnt);
 }
 static int __init init_reiserfs_fs(void)
@@ -2253,7 +2252,7 @@ static void __exit exit_reiserfs_fs(void)
 struct file_system_type reiserfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "reiserfs",
-        .get_sb = get_super_block,
+        .mount = get_super_block,
        .kill_sb = reiserfs_kill_sb,
        .fs_flags = FS_REQUIRES_DEV,
 };
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf273c672..5d04a7828e7a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@ static inline __u32 xattr_hash(const char *msg, int len)
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
-                           unsigned from, unsigned to);
 static void update_ctime(struct inode *inode)
 {
        struct timespec now = current_fs_time(inode->i_sb);
-        if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+        if (inode_unhashed(inode) || !inode->i_nlink ||
            timespec_equal(&inode->i_ctime, &now))
                return;
@@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
                        rxh->h_hash = cpu_to_le32(xahash);
                }
-                err = reiserfs_prepare_write(NULL, page, page_offset,
+                err = __reiserfs_write_begin(page, page_offset, chunk + skip);
-                                            page_offset + chunk + skip);
                if (!err) {
                        if (buffer)
                                memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 42d213546894..6647f90e55cd 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -282,6 +282,7 @@ error:
 static const struct file_operations romfs_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = romfs_readdir,
+        .llseek         = default_llseek,
 };
 static const struct inode_operations romfs_dir_inode_operations = {
@@ -551,20 +552,19 @@ error_rsb:
 /*
 * get a superblock for mounting
 */
-static int romfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *romfs_mount(struct file_system_type *fs_type,
                        int flags, const char *dev_name,
-                        void *data, struct vfsmount *mnt)
+                        void *data)
 {
-        int ret = -EINVAL;
+        struct dentry *ret = ERR_PTR(-EINVAL);
 #ifdef CONFIG_ROMFS_ON_MTD
-        ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
+        ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super);
-                         mnt);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
-        if (ret == -EINVAL)
+        if (ret == ERR_PTR(-EINVAL))
-                ret = get_sb_bdev(fs_type, flags, dev_name, data,
+                ret = mount_bdev(fs_type, flags, dev_name, data,
-                                  romfs_fill_super, mnt);
+                                  romfs_fill_super);
 #endif
        return ret;
 }
@@ -591,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb)
 static struct file_system_type romfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "romfs",
-        .get_sb         = romfs_get_sb,
+        .mount          = romfs_mount,
        .kill_sb        = romfs_kill_sb,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/select.c b/fs/select.c
index 500a669f7790..b7b10aa30861 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -67,7 +67,7 @@ static long __estimate_accuracy(struct timespec *tv)
        return slack;
 }
-static long estimate_accuracy(struct timespec *tv)
+long select_estimate_accuracy(struct timespec *tv)
 {
        unsigned long ret;
        struct timespec now;
@@ -417,7 +417,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
        }
        if (end_time && !timed_out)
-                slack = estimate_accuracy(end_time);
+                slack = select_estimate_accuracy(end_time);
        retval = 0;
        for (;;) {
@@ -769,7 +769,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
        }
        if (end_time && !timed_out)
-                slack = estimate_accuracy(end_time);
+                slack = select_estimate_accuracy(end_time);
        for (;;) {
                struct poll_list *walk;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e1f437be6c3c..05d6b0e78c95 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -131,7 +131,7 @@ Eoverflow:
 */
 ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
-        struct seq_file *m = (struct seq_file *)file->private_data;
+        struct seq_file *m = file->private_data;
        size_t copied = 0;
        loff_t pos;
        size_t n;
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(seq_read);
 */
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
-        struct seq_file *m = (struct seq_file *)file->private_data;
+        struct seq_file *m = file->private_data;
        loff_t retval = -EINVAL;
        mutex_lock(&m->lock);
@@ -324,7 +324,7 @@ EXPORT_SYMBOL(seq_lseek);
 */
 int seq_release(struct inode *inode, struct file *file)
 {
-        struct seq_file *m = (struct seq_file *)file->private_data;
+        struct seq_file *m = file->private_data;
        kfree(m->buf);
        kfree(m);
        return 0;
@@ -462,9 +462,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
        if (size) {
                char *p;
-                spin_lock(&dcache_lock);
                p = __d_path(path, root, buf, size);
-                spin_unlock(&dcache_lock);
                res = PTR_ERR(p);
                if (!IS_ERR(p)) {
                        char *end = mangle_path(buf, p, esc);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c5a6add779d..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
                err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+                /* 
+                 * Other callers might not initialize the si_lsb field,
+                 * so check explicitly for the right codes here.
+                 */
+                if (kinfo->si_code == BUS_MCEERR_AR ||
+                    kinfo->si_code == BUS_MCEERR_AO)
+                        err |= __put_user((short) kinfo->si_addr_lsb,
+                                          &uinfo->ssi_addr_lsb);
+#endif
                break;
        case __SI_CHLD:
                err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
@@ -206,6 +216,7 @@ static const struct file_operations signalfd_fops = {
        .release        = signalfd_release,
        .poll           = signalfd_poll,
        .read           = signalfd_read,
+        .llseek         = noop_llseek,
 };
 SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
deleted file mode 100644
index e668127c8b2e..000000000000
--- a/fs/smbfs/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-config SMB_FS
-        tristate "SMB file system support (OBSOLETE, please use CIFS)"
-        depends on INET
-        select NLS
-        help
-          SMB (Server Message Block) is the protocol Windows for Workgroups
-          (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-          files and printers over local networks.  Saying Y here allows you to
-          mount their file systems (often called "shares" in this context) and
-          access them just like any other Unix directory.  Currently, this
-          works only if the Windows machines use TCP/IP as the underlying
-          transport protocol, and not NetBEUI.  For details, read
-          <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-          available from <http://www.tldp.org/docs.html#howto>.
-          Note: if you just want your box to act as an SMB *server* and make
-          files and printing services available to Windows clients (which need
-          to have a TCP/IP stack), you don't need to say Y here; you can use
-          the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-          for that.
-          General information about how to connect Linux, Windows machines and
-          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-          To compile the SMB support as a module, choose M here:
-          the module will be called smbfs.  Most people say N, however.
-config SMB_NLS_DEFAULT
-        bool "Use a default NLS"
-        depends on SMB_FS
-        help
-          Enabling this will make smbfs use nls translations by default. You
-          need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-          settings and you need to give the default nls for the SMB server as
-          CONFIG_SMB_NLS_REMOTE.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
-config SMB_NLS_REMOTE
-        string "Default Remote NLS Option"
-        depends on SMB_NLS_DEFAULT
-        default "cp437"
-        help
-          This setting allows you to specify a default value for which
-          codepage the server uses. If this field is left blank no
-          translations will be done by default. The local codepage/charset
-          default to CONFIG_NLS_DEFAULT.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
deleted file mode 100644
index 4faf8c4722c3..000000000000
--- a/fs/smbfs/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Makefile for the linux smb-filesystem routines.
-#
-obj-$(CONFIG_SMB_FS) += smbfs.o
-smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
-                symlink.o smbiod.o request.o
-# If you want debugging output, you may add these flags to the EXTRA_CFLAGS
-# SMBFS_PARANOIA should normally be enabled.
-EXTRA_CFLAGS += -DSMBFS_PARANOIA
-#EXTRA_CFLAGS += -DSMBFS_DEBUG
-#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
-#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
-#EXTRA_CFLAGS += -Werror
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
deleted file mode 100644
index 8c177eb7e344..000000000000
--- a/fs/smbfs/cache.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  cache.c
- *
- * Copyright (C) 1997 by Bill Hawes
- *
- * Routines to support directory cacheing using the page cache.
- * This cache code is almost directly taken from ncpfs.
- *
- * Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smb_fs.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <asm/page.h>
-#include "smb_debug.h"
-#include "proto.h"
-/*
- * Force the next attempt to use the cache to be a timeout.
- * If we can't find the page that's fine, it will cause a refresh.
- */
-void
-smb_invalid_dir_cache(struct inode * dir)
-{
-        struct smb_sb_info *server = server_from_inode(dir);
-        union  smb_dir_cache *cache = NULL;
-        struct page *page = NULL;
-        page = grab_cache_page(&dir->i_data, 0);
-        if (!page)
-                goto out;
-        if (!PageUptodate(page))
-                goto out_unlock;
-        cache = kmap(page);
-        cache->head.time = jiffies - SMB_MAX_AGE(server);
-        kunmap(page);
-        SetPageUptodate(page);
-out_unlock:
-        unlock_page(page);
-        page_cache_release(page);
-out:
-        return;
-}
-/*
- * Mark all dentries for 'parent' as invalid, forcing them to be re-read
- */
-void
-smb_invalidate_dircache_entries(struct dentry *parent)
-{
-        struct smb_sb_info *server = server_from_dentry(parent);
-        struct list_head *next;
-        struct dentry *dentry;
-        spin_lock(&dcache_lock);
-        next = parent->d_subdirs.next;
-        while (next != &parent->d_subdirs) {
-                dentry = list_entry(next, struct dentry, d_u.d_child);
-                dentry->d_fsdata = NULL;
-                smb_age_dentry(server, dentry);
-                next = next->next;
-        }
-        spin_unlock(&dcache_lock);
-}
-/*
- * dget, but require that fpos and parent matches what the dentry contains.
- * dentry is not known to be a valid pointer at entry.
- */
-struct dentry *
-smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
-        struct dentry *dent = dentry;
-        struct list_head *next;
-        if (d_validate(dent, parent)) {
-                if (dent->d_name.len <= SMB_MAXNAMELEN &&
-                    (unsigned long)dent->d_fsdata == fpos) {
-                        if (!dent->d_inode) {
-                                dput(dent);
-                                dent = NULL;
-                        }
-                        return dent;
-                }
-                dput(dent);
-        }
-        /* If a pointer is invalid, we search the dentry. */
-        spin_lock(&dcache_lock);
-        next = parent->d_subdirs.next;
-        while (next != &parent->d_subdirs) {
-                dent = list_entry(next, struct dentry, d_u.d_child);
-                if ((unsigned long)dent->d_fsdata == fpos) {
-                        if (dent->d_inode)
-                                dget_locked(dent);
-                        else
-                                dent = NULL;
-                        goto out_unlock;
-                }
-                next = next->next;
-        }
-        dent = NULL;
-out_unlock:
-        spin_unlock(&dcache_lock);
-        return dent;
-}
-/*
- * Create dentry/inode for this file and add it to the dircache.
- */
-int
-smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-               struct smb_cache_control *ctrl, struct qstr *qname,
-               struct smb_fattr *entry)
-{
-        struct dentry *newdent, *dentry = filp->f_path.dentry;
-        struct inode *newino, *inode = dentry->d_inode;
-        struct smb_cache_control ctl = *ctrl;
-        int valid = 0;
-        int hashed = 0;
-        ino_t ino = 0;
-        qname->hash = full_name_hash(qname->name, qname->len);
-        if (dentry->d_op && dentry->d_op->d_hash)
-                if (dentry->d_op->d_hash(dentry, qname) != 0)
-                        goto end_advance;
-        newdent = d_lookup(dentry, qname);
-        if (!newdent) {
-                newdent = d_alloc(dentry, qname);
-                if (!newdent)
-                        goto end_advance;
-        } else {
-                hashed = 1;
-                memcpy((char *) newdent->d_name.name, qname->name,
-                       newdent->d_name.len);
-        }
-        if (!newdent->d_inode) {
-                smb_renew_times(newdent);
-                entry->f_ino = iunique(inode->i_sb, 2);
-                newino = smb_iget(inode->i_sb, entry);
-                if (newino) {
-                        smb_new_dentry(newdent);
-                        d_instantiate(newdent, newino);
-                        if (!hashed)
-                                d_rehash(newdent);
-                }
-        } else
-                smb_set_inode_attr(newdent->d_inode, entry);
-        if (newdent->d_inode) {
-                ino = newdent->d_inode->i_ino;
-                newdent->d_fsdata = (void *) ctl.fpos;
-                smb_new_dentry(newdent);
-        }
-        if (ctl.idx >= SMB_DIRCACHE_SIZE) {
-                if (ctl.page) {
-                        kunmap(ctl.page);
-                        SetPageUptodate(ctl.page);
-                        unlock_page(ctl.page);
-                        page_cache_release(ctl.page);
-                }
-                ctl.cache = NULL;
-                ctl.idx  -= SMB_DIRCACHE_SIZE;
-                ctl.ofs  += 1;
-                ctl.page  = grab_cache_page(&inode->i_data, ctl.ofs);
-                if (ctl.page)
-                        ctl.cache = kmap(ctl.page);
-        }
-        if (ctl.cache) {
-                ctl.cache->dentry[ctl.idx] = newdent;
-                valid = 1;
-        }
-        dput(newdent);
-end_advance:
-        if (!valid)
-                ctl.valid = 0;
-        if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-                if (!ino)
-                        ino = find_inode_number(dentry, qname);
-                if (!ino)
-                        ino = iunique(inode->i_sb, 2);
-                ctl.filled = filldir(dirent, qname->name, qname->len,
-                                     filp->f_pos, ino, DT_UNKNOWN);
-                if (!ctl.filled)
-                        filp->f_pos += 1;
-        }
-        ctl.fpos += 1;
-        ctl.idx  += 1;
-        *ctrl = ctl;
-        return (ctl.valid || !ctl.filled);
-}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
deleted file mode 100644
index 00a70cab1f36..000000000000
--- a/fs/smbfs/dir.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  dir.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/ctype.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <linux/smbno.h>
-#include "smb_debug.h"
-#include "proto.h"
-static int smb_readdir(struct file *, void *, filldir_t);
-static int smb_dir_open(struct inode *, struct file *);
-static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int smb_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int smb_mkdir(struct inode *, struct dentry *, int);
-static int smb_rmdir(struct inode *, struct dentry *);
-static int smb_unlink(struct inode *, struct dentry *);
-static int smb_rename(struct inode *, struct dentry *,
-                      struct inode *, struct dentry *);
-static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
-static int smb_link(struct dentry *, struct inode *, struct dentry *);
-const struct file_operations smb_dir_operations =
-{
-        .llseek         = generic_file_llseek,
-        .read           = generic_read_dir,
-        .readdir        = smb_readdir,
-        .unlocked_ioctl = smb_ioctl,
-        .open           = smb_dir_open,
-};
-const struct inode_operations smb_dir_inode_operations =
-{
-        .create         = smb_create,
-        .lookup         = smb_lookup,
-        .unlink         = smb_unlink,
-        .mkdir          = smb_mkdir,
-        .rmdir          = smb_rmdir,
-        .rename         = smb_rename,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-};
-const struct inode_operations smb_dir_inode_operations_unix =
-{
-        .create         = smb_create,
-        .lookup         = smb_lookup,
-        .unlink         = smb_unlink,
-        .mkdir          = smb_mkdir,
-        .rmdir          = smb_rmdir,
-        .rename         = smb_rename,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-        .symlink        = smb_symlink,
-        .mknod          = smb_make_node,
-        .link           = smb_link,
-};
-/*
- * Read a directory, using filldir to fill the dirent memory.
- * smb_proc_readdir does the actual reading from the smb server.
- *
- * The cache code is almost directly taken from ncpfs
- */
-static int 
-smb_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-        struct dentry *dentry = filp->f_path.dentry;
-        struct inode *dir = dentry->d_inode;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        union  smb_dir_cache *cache = NULL;
-        struct smb_cache_control ctl;
-        struct page *page = NULL;
-        int result;
-        ctl.page  = NULL;
-        ctl.cache = NULL;
-        VERBOSE("reading %s/%s, f_pos=%d\n",
-                DENTRY_PATH(dentry),  (int) filp->f_pos);
-        result = 0;
-        lock_kernel();
-        switch ((unsigned int) filp->f_pos) {
-        case 0:
-                if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = 1;
-                /* fallthrough */
-        case 1:
-                if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR) < 0)
-                        goto out;
-                filp->f_pos = 2;
-        }
-        /*
-         * Make sure our inode is up-to-date.
-         */
-        result = smb_revalidate_inode(dentry);
-        if (result)
-                goto out;
-        page = grab_cache_page(&dir->i_data, 0);
-        if (!page)
-                goto read_really;
-        ctl.cache = cache = kmap(page);
-        ctl.head  = cache->head;
-        if (!PageUptodate(page) || !ctl.head.eof) {
-                VERBOSE("%s/%s, page uptodate=%d, eof=%d\n",
-                         DENTRY_PATH(dentry), PageUptodate(page),ctl.head.eof);
-                goto init_cache;
-        }
-        if (filp->f_pos == 2) {
-                if (jiffies - ctl.head.time >= SMB_MAX_AGE(server))
-                        goto init_cache;
-                /*
-                 * N.B. ncpfs checks mtime of dentry too here, we don't.
-                 *   1. common smb servers do not update mtime on dir changes
-                 *   2. it requires an extra smb request
-                 *      (revalidate has the same timeout as ctl.head.time)
-                 *
-                 * Instead smbfs invalidates its own cache on local changes
-                 * and remote changes are not seen until timeout.
-                 */
-        }
-        if (filp->f_pos > ctl.head.end)
-                goto finished;
-        ctl.fpos = filp->f_pos + (SMB_DIRCACHE_START - 2);
-        ctl.ofs  = ctl.fpos / SMB_DIRCACHE_SIZE;
-        ctl.idx  = ctl.fpos % SMB_DIRCACHE_SIZE;
-        for (;;) {
-                if (ctl.ofs != 0) {
-                        ctl.page = find_lock_page(&dir->i_data, ctl.ofs);
-                        if (!ctl.page)
-                                goto invalid_cache;
-                        ctl.cache = kmap(ctl.page);
-                        if (!PageUptodate(ctl.page))
-                                goto invalid_cache;
-                }
-                while (ctl.idx < SMB_DIRCACHE_SIZE) {
-                        struct dentry *dent;
-                        int res;
-                        dent = smb_dget_fpos(ctl.cache->dentry[ctl.idx],
-                                             dentry, filp->f_pos);
-                        if (!dent)
-                                goto invalid_cache;
-                        res = filldir(dirent, dent->d_name.name,
-                                      dent->d_name.len, filp->f_pos,
-                                      dent->d_inode->i_ino, DT_UNKNOWN);
-                        dput(dent);
-                        if (res)
-                                goto finished;
-                        filp->f_pos += 1;
-                        ctl.idx += 1;
-                        if (filp->f_pos > ctl.head.end)
-                                goto finished;
-                }
-                if (ctl.page) {
-                        kunmap(ctl.page);
-                        SetPageUptodate(ctl.page);
-                        unlock_page(ctl.page);
-                        page_cache_release(ctl.page);
-                        ctl.page = NULL;
-                }
-                ctl.idx  = 0;
-                ctl.ofs += 1;
-        }
-invalid_cache:
-        if (ctl.page) {
-                kunmap(ctl.page);
-                unlock_page(ctl.page);
-                page_cache_release(ctl.page);
-                ctl.page = NULL;
-        }
-        ctl.cache = cache;
-init_cache:
-        smb_invalidate_dircache_entries(dentry);
-        ctl.head.time = jiffies;
-        ctl.head.eof = 0;
-        ctl.fpos = 2;
-        ctl.ofs = 0;
-        ctl.idx = SMB_DIRCACHE_START;
-        ctl.filled = 0;
-        ctl.valid  = 1;
-read_really:
-        result = server->ops->readdir(filp, dirent, filldir, &ctl);
-        if (result == -ERESTARTSYS && page)
-                ClearPageUptodate(page);
-        if (ctl.idx == -1)
-                goto invalid_cache;     /* retry */
-        ctl.head.end = ctl.fpos - 1;
-        ctl.head.eof = ctl.valid;
-finished:
-        if (page) {
-                cache->head = ctl.head;
-                kunmap(page);
-                if (result != -ERESTARTSYS)
-                        SetPageUptodate(page);
-                unlock_page(page);
-                page_cache_release(page);
-        }
-        if (ctl.page) {
-                kunmap(ctl.page);
-                SetPageUptodate(ctl.page);
-                unlock_page(ctl.page);
-                page_cache_release(ctl.page);
-        }
-out:
-        unlock_kernel();
-        return result;
-}
-static int
-smb_dir_open(struct inode *dir, struct file *file)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        struct smb_sb_info *server;
-        int error = 0;
-        VERBOSE("(%s/%s)\n", dentry->d_parent->d_name.name,
-                file->f_path.dentry->d_name.name);
-        /*
-         * Directory timestamps in the core protocol aren't updated
-         * when a file is added, so we give them a very short TTL.
-         */
-        lock_kernel();
-        server = server_from_dentry(dentry);
-        if (server->opt.protocol < SMB_PROTOCOL_LANMAN2) {
-                unsigned long age = jiffies - SMB_I(dir)->oldmtime;
-                if (age > 2*HZ)
-                        smb_invalid_dir_cache(dir);
-        }
-        /*
-         * Note: in order to allow the smbmount process to open the
-         * mount point, we only revalidate if the connection is valid or
-         * if the process is trying to access something other than the root.
-         */
-        if (server->state == CONN_VALID || !IS_ROOT(dentry))
-                error = smb_revalidate_inode(dentry);
-        unlock_kernel();
-        return error;
-}
-/*
- * Dentry operations routines
- */
-static int smb_lookup_validate(struct dentry *, struct nameidata *);
-static int smb_hash_dentry(struct dentry *, struct qstr *);
-static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
-static int smb_delete_dentry(struct dentry *);
-static const struct dentry_operations smbfs_dentry_operations =
-{
-        .d_revalidate   = smb_lookup_validate,
-        .d_hash         = smb_hash_dentry,
-        .d_compare      = smb_compare_dentry,
-        .d_delete       = smb_delete_dentry,
-};
-static const struct dentry_operations smbfs_dentry_operations_case =
-{
-        .d_revalidate   = smb_lookup_validate,
-        .d_delete       = smb_delete_dentry,
-};
-/*
- * This is the callback when the dcache has a lookup hit.
- */
-static int
-smb_lookup_validate(struct dentry * dentry, struct nameidata *nd)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode * inode = dentry->d_inode;
-        unsigned long age = jiffies - dentry->d_time;
-        int valid;
-        /*
-         * The default validation is based on dentry age:
-         * we believe in dentries for a few seconds.  (But each
-         * successful server lookup renews the timestamp.)
-         */
-        valid = (age <= SMB_MAX_AGE(server));
-#ifdef SMBFS_DEBUG_VERBOSE
-        if (!valid)
-                VERBOSE("%s/%s not valid, age=%lu\n", 
-                        DENTRY_PATH(dentry), age);
-#endif
-        if (inode) {
-                lock_kernel();
-                if (is_bad_inode(inode)) {
-                        PARANOIA("%s/%s has dud inode\n", DENTRY_PATH(dentry));
-                        valid = 0;
-                } else if (!valid)
-                        valid = (smb_revalidate_inode(dentry) == 0);
-                unlock_kernel();
-        } else {
-                /*
-                 * What should we do for negative dentries?
-                 */
-        }
-        return valid;
-}
-static int 
-smb_hash_dentry(struct dentry *dir, struct qstr *this)
-{
-        unsigned long hash;
-        int i;
-        hash = init_name_hash();
-        for (i=0; i < this->len ; i++)
-                hash = partial_name_hash(tolower(this->name[i]), hash);
-        this->hash = end_name_hash(hash);
-  
-        return 0;
-}
-static int
-smb_compare_dentry(struct dentry *dir, struct qstr *a, struct qstr *b)
-{
-        int i, result = 1;
-        if (a->len != b->len)
-                goto out;
-        for (i=0; i < a->len; i++) {
-                if (tolower(a->name[i]) != tolower(b->name[i]))
-                        goto out;
-        }
-        result = 0;
-out:
-        return result;
-}
-/*
- * This is the callback from dput() when d_count is going to 0.
- * We use this to unhash dentries with bad inodes.
- */
-static int
-smb_delete_dentry(struct dentry * dentry)
-{
-        if (dentry->d_inode) {
-                if (is_bad_inode(dentry->d_inode)) {
-                        PARANOIA("bad inode, unhashing %s/%s\n",
-                                 DENTRY_PATH(dentry));
-                        return 1;
-                }
-        } else {
-                /* N.B. Unhash negative dentries? */
-        }
-        return 0;
-}
-/*
- * Initialize a new dentry
- */
-void
-smb_new_dentry(struct dentry *dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        if (server->mnt->flags & SMB_MOUNT_CASE)
-                dentry->d_op = &smbfs_dentry_operations_case;
-        else
-                dentry->d_op = &smbfs_dentry_operations;
-        dentry->d_time = jiffies;
-}
-/*
- * Whenever a lookup succeeds, we know the parent directories
- * are all valid, so we want to update the dentry timestamps.
- * N.B. Move this to dcache?
- */
-void
-smb_renew_times(struct dentry * dentry)
-{
-        dget(dentry);
-        spin_lock(&dentry->d_lock);
-        for (;;) {
-                struct dentry *parent;
-                dentry->d_time = jiffies;
-                if (IS_ROOT(dentry))
-                        break;
-                parent = dentry->d_parent;
-                dget(parent);
-                spin_unlock(&dentry->d_lock);
-                dput(dentry);
-                dentry = parent;
-                spin_lock(&dentry->d_lock);
-        }
-        spin_unlock(&dentry->d_lock);
-        dput(dentry);
-}
-static struct dentry *
-smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct smb_fattr finfo;
-        struct inode *inode;
-        int error;
-        struct smb_sb_info *server;
-        error = -ENAMETOOLONG;
-        if (dentry->d_name.len > SMB_MAXNAMELEN)
-                goto out;
-        /* Do not allow lookup of names with backslashes in */
-        error = -EINVAL;
-        if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
-                goto out;
-        lock_kernel();
-        error = smb_proc_getattr(dentry, &finfo);
-#ifdef SMBFS_PARANOIA
-        if (error && error != -ENOENT)
-                PARANOIA("find %s/%s failed, error=%d\n",
-                         DENTRY_PATH(dentry), error);
-#endif
-        inode = NULL;
-        if (error == -ENOENT)
-                goto add_entry;
-        if (!error) {
-                error = -EACCES;
-                finfo.f_ino = iunique(dentry->d_sb, 2);
-                inode = smb_iget(dir->i_sb, &finfo);
-                if (inode) {
-        add_entry:
-                        server = server_from_dentry(dentry);
-                        if (server->mnt->flags & SMB_MOUNT_CASE)
-                                dentry->d_op = &smbfs_dentry_operations_case;
-                        else
-                                dentry->d_op = &smbfs_dentry_operations;
-                        d_add(dentry, inode);
-                        smb_renew_times(dentry);
-                        error = 0;
-                }
-        }
-        unlock_kernel();
-out:
-        return ERR_PTR(error);
-}
-/*
- * This code is common to all routines creating a new inode.
- */
-static int
-smb_instantiate(struct dentry *dentry, __u16 fileid, int have_id)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode *inode;
-        int error;
-        struct smb_fattr fattr;
-        VERBOSE("file %s/%s, fileid=%u\n", DENTRY_PATH(dentry), fileid);
-        error = smb_proc_getattr(dentry, &fattr);
-        if (error)
-                goto out_close;
-        smb_renew_times(dentry);
-        fattr.f_ino = iunique(dentry->d_sb, 2);
-        inode = smb_iget(dentry->d_sb, &fattr);
-        if (!inode)
-                goto out_no_inode;
-        if (have_id) {
-                struct smb_inode_info *ei = SMB_I(inode);
-                ei->fileid = fileid;
-                ei->access = SMB_O_RDWR;
-                ei->open = server->generation;
-        }
-        d_instantiate(dentry, inode);
-out:
-        return error;
-out_no_inode:
-        error = -EACCES;
-out_close:
-        if (have_id) {
-                PARANOIA("%s/%s failed, error=%d, closing %u\n",
-                         DENTRY_PATH(dentry), error, fileid);
-                smb_close_fileid(dentry, fileid);
-        }
-        goto out;
-}
-/* N.B. How should the mode argument be used? */
-static int
-smb_create(struct inode *dir, struct dentry *dentry, int mode,
-                struct nameidata *nd)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        __u16 fileid;
-        int error;
-        struct iattr attr;
-        VERBOSE("creating %s/%s, mode=%d\n", DENTRY_PATH(dentry), mode);
-        lock_kernel();
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_create(dentry, 0, get_seconds(), &fileid);
-        if (!error) {
-                if (server->opt.capabilities & SMB_CAP_UNIX) {
-                        /* Set attributes for new file */
-                        attr.ia_valid = ATTR_MODE;
-                        attr.ia_mode = mode;
-                        error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-                }
-                error = smb_instantiate(dentry, fileid, 1);
-        } else {
-                PARANOIA("%s/%s failed, error=%d\n",
-                         DENTRY_PATH(dentry), error);
-        }
-        unlock_kernel();
-        return error;
-}
-/* N.B. How should the mode argument be used? */
-static int
-smb_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int error;
-        struct iattr attr;
-        lock_kernel();
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_mkdir(dentry);
-        if (!error) {
-                if (server->opt.capabilities & SMB_CAP_UNIX) {
-                        /* Set attributes for new directory */
-                        attr.ia_valid = ATTR_MODE;
-                        attr.ia_mode = mode;
-                        error = smb_proc_setattr_unix(dentry, &attr, 0, 0);
-                }
-                error = smb_instantiate(dentry, 0, 0);
-        }
-        unlock_kernel();
-        return error;
-}
-static int
-smb_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        struct inode *inode = dentry->d_inode;
-        int error;
-        /*
-         * Close the directory if it's open.
-         */
-        lock_kernel();
-        smb_close(inode);
-        /*
-         * Check that nobody else is using the directory..
-         */
-        error = -EBUSY;
-        if (!d_unhashed(dentry))
-                goto out;
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_rmdir(dentry);
-out:
-        unlock_kernel();
-        return error;
-}
-static int
-smb_unlink(struct inode *dir, struct dentry *dentry)
-{
-        int error;
-        /*
-         * Close the file if it's open.
-         */
-        lock_kernel();
-        smb_close(dentry->d_inode);
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_unlink(dentry);
-        if (!error)
-                smb_renew_times(dentry);
-        unlock_kernel();
-        return error;
-}
-static int
-smb_rename(struct inode *old_dir, struct dentry *old_dentry,
-           struct inode *new_dir, struct dentry *new_dentry)
-{
-        int error;
-        /*
-         * Close any open files, and check whether to delete the
-         * target before attempting the rename.
-         */
-        lock_kernel();
-        if (old_dentry->d_inode)
-                smb_close(old_dentry->d_inode);
-        if (new_dentry->d_inode) {
-                smb_close(new_dentry->d_inode);
-                error = smb_proc_unlink(new_dentry);
-                if (error) {
-                        VERBOSE("unlink %s/%s, error=%d\n",
-                                DENTRY_PATH(new_dentry), error);
-                        goto out;
-                }
-                /* FIXME */
-                d_delete(new_dentry);
-        }
-        smb_invalid_dir_cache(old_dir);
-        smb_invalid_dir_cache(new_dir);
-        error = smb_proc_mv(old_dentry, new_dentry);
-        if (!error) {
-                smb_renew_times(old_dentry);
-                smb_renew_times(new_dentry);
-        }
-out:
-        unlock_kernel();
-        return error;
-}
-/*
- * FIXME: samba servers won't let you create device nodes unless uid/gid
- * matches the connection credentials (and we don't know which those are ...)
- */
-static int
-smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-        int error;
-        struct iattr attr;
-        attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
-        attr.ia_mode = mode;
-        current_euid_egid(&attr.ia_uid, &attr.ia_gid);
-        if (!new_valid_dev(dev))
-                return -EINVAL;
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_setattr_unix(dentry, &attr, MAJOR(dev), MINOR(dev));
-        if (!error) {
-                error = smb_instantiate(dentry, 0, 0);
-        }
-        return error;
-}
-/*
- * dentry = existing file
- * new_dentry = new file
- */
-static int
-smb_link(struct dentry *dentry, struct inode *dir, struct dentry *new_dentry)
-{
-        int error;
-        DEBUG1("smb_link old=%s/%s new=%s/%s\n",
-               DENTRY_PATH(dentry), DENTRY_PATH(new_dentry));
-        smb_invalid_dir_cache(dir);
-        error = smb_proc_link(server_from_dentry(dentry), dentry, new_dentry);
-        if (!error) {
-                smb_renew_times(dentry);
-                error = smb_instantiate(new_dentry, 0, 0);
-        }
-        return error;
-}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
deleted file mode 100644
index 8e187a0f94bb..000000000000
--- a/fs/smbfs/file.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- *  file.c
- *
- *  Copyright (C) 1995, 1996, 1997 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/aio.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-#include "smb_debug.h"
-#include "proto.h"
-static int
-smb_fsync(struct file *file, int datasync)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int result;
-        VERBOSE("sync file %s/%s\n", DENTRY_PATH(dentry));
-        /*
-         * The VFS will writepage() all dirty pages for us, but we
-         * should send a SMBflush to the server, letting it know that
-         * we want things synchronized with actual storage.
-         *
-         * Note: this function requires all pages to have been written already
-         *       (should be ok with writepage_sync)
-         */
-        result = smb_proc_flush(server, SMB_I(dentry->d_inode)->fileid);
-        return result;
-}
-/*
- * Read a page synchronously.
- */
-static int
-smb_readpage_sync(struct dentry *dentry, struct page *page)
-{
-        char *buffer = kmap(page);
-        loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        unsigned int rsize = smb_get_rsize(server);
-        int count = PAGE_SIZE;
-        int result;
-        VERBOSE("file %s/%s, count=%d@%Ld, rsize=%d\n",
-                DENTRY_PATH(dentry), count, offset, rsize);
-        result = smb_open(dentry, SMB_O_RDONLY);
-        if (result < 0)
-                goto io_error;
-        do {
-                if (count < rsize)
-                        rsize = count;
-                result = server->ops->read(dentry->d_inode,offset,rsize,buffer);
-                if (result < 0)
-                        goto io_error;
-                count -= result;
-                offset += result;
-                buffer += result;
-                dentry->d_inode->i_atime =
-                        current_fs_time(dentry->d_inode->i_sb);
-                if (result < rsize)
-                        break;
-        } while (count);
-        memset(buffer, 0, count);
-        flush_dcache_page(page);
-        SetPageUptodate(page);
-        result = 0;
-io_error:
-        kunmap(page);
-        unlock_page(page);
-        return result;
-}
-/*
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_readpage(struct file *file, struct page *page)
-{
-        int             error;
-        struct dentry  *dentry = file->f_path.dentry;
-        page_cache_get(page);
-        error = smb_readpage_sync(dentry, page);
-        page_cache_release(page);
-        return error;
-}
-/*
- * Write a page synchronously.
- * Offset is the data offset within the page.
- */
-static int
-smb_writepage_sync(struct inode *inode, struct page *page,
-                   unsigned long pageoffset, unsigned int count)
-{
-        loff_t offset;
-        char *buffer = kmap(page) + pageoffset;
-        struct smb_sb_info *server = server_from_inode(inode);
-        unsigned int wsize = smb_get_wsize(server);
-        int ret = 0;
-        offset = ((loff_t)page->index << PAGE_CACHE_SHIFT) + pageoffset;
-        VERBOSE("file ino=%ld, fileid=%d, count=%d@%Ld, wsize=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, offset, wsize);
-        do {
-                int write_ret;
-                if (count < wsize)
-                        wsize = count;
-                write_ret = server->ops->write(inode, offset, wsize, buffer);
-                if (write_ret < 0) {
-                        PARANOIA("failed write, wsize=%d, write_ret=%d\n",
-                                 wsize, write_ret);
-                        ret = write_ret;
-                        break;
-                }
-                /* N.B. what if result < wsize?? */
-#ifdef SMBFS_PARANOIA
-                if (write_ret < wsize)
-                        PARANOIA("short write, wsize=%d, write_ret=%d\n",
-                                 wsize, write_ret);
-#endif
-                buffer += wsize;
-                offset += wsize;
-                count -= wsize;
-                /*
-                 * Update the inode now rather than waiting for a refresh.
-                 */
-                inode->i_mtime = inode->i_atime = current_fs_time(inode->i_sb);
-                SMB_I(inode)->flags |= SMB_F_LOCALWRITE;
-                if (offset > inode->i_size)
-                        inode->i_size = offset;
-        } while (count);
-        kunmap(page);
-        return ret;
-}
-/*
- * Write a page to the server. This will be used for NFS swapping only
- * (for now), and we currently do this synchronously only.
- *
- * We are called with the page locked and we unlock it when done.
- */
-static int
-smb_writepage(struct page *page, struct writeback_control *wbc)
-{
-        struct address_space *mapping = page->mapping;
-        struct inode *inode;
-        unsigned long end_index;
-        unsigned offset = PAGE_CACHE_SIZE;
-        int err;
-        BUG_ON(!mapping);
-        inode = mapping->host;
-        BUG_ON(!inode);
-        end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-        /* easy case */
-        if (page->index < end_index)
-                goto do_it;
-        /* things got complicated... */
-        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
-        /* OK, are we completely out? */
-        if (page->index >= end_index+1 || !offset)
-                return 0; /* truncated - don't care */
-do_it:
-        page_cache_get(page);
-        err = smb_writepage_sync(inode, page, 0, offset);
-        SetPageUptodate(page);
-        unlock_page(page);
-        page_cache_release(page);
-        return err;
-}
-static int
-smb_updatepage(struct file *file, struct page *page, unsigned long offset,
-               unsigned int count)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
-                ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
-        return smb_writepage_sync(dentry->d_inode, page, offset, count);
-}
-static ssize_t
-smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                        unsigned long nr_segs, loff_t pos)
-{
-        struct file * file = iocb->ki_filp;
-        struct dentry * dentry = file->f_path.dentry;
-        ssize_t status;
-        VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-                (unsigned long) iocb->ki_left, (unsigned long) pos);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
-                (long)dentry->d_inode->i_size,
-                dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
-        status = generic_file_aio_read(iocb, iov, nr_segs, pos);
-out:
-        return status;
-}
-static int
-smb_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-        struct dentry * dentry = file->f_path.dentry;
-        int     status;
-        VERBOSE("file %s/%s, address %lu - %lu\n",
-                DENTRY_PATH(dentry), vma->vm_start, vma->vm_end);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%d\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        status = generic_file_mmap(file, vma);
-out:
-        return status;
-}
-static ssize_t
-smb_file_splice_read(struct file *file, loff_t *ppos,
-                     struct pipe_inode_info *pipe, size_t count,
-                     unsigned int flags)
-{
-        struct dentry *dentry = file->f_path.dentry;
-        ssize_t status;
-        VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
-                DENTRY_PATH(dentry), *ppos, count);
-        status = smb_revalidate_inode(dentry);
-        if (status) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), status);
-                goto out;
-        }
-        status = generic_file_splice_read(file, ppos, pipe, count, flags);
-out:
-        return status;
-}
-/*
- * This does the "real" work of the write. The generic routine has
- * allocated the page, locked it, done all the page alignment stuff
- * calculations etc. Now we should just copy the data from user
- * space and write it back to the real medium..
- *
- * If the writer ends up delaying the write, the writer needs to
- * increment the page use counts until he is done with the page.
- */
-static int smb_write_begin(struct file *file, struct address_space *mapping,
-                        loff_t pos, unsigned len, unsigned flags,
-                        struct page **pagep, void **fsdata)
-{
-        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        *pagep = grab_cache_page_write_begin(mapping, index, flags);
-        if (!*pagep)
-                return -ENOMEM;
-        return 0;
-}
-static int smb_write_end(struct file *file, struct address_space *mapping,
-                        loff_t pos, unsigned len, unsigned copied,
-                        struct page *page, void *fsdata)
-{
-        int status;
-        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-        lock_kernel();
-        status = smb_updatepage(file, page, offset, copied);
-        unlock_kernel();
-        if (!status) {
-                if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
-                        SetPageUptodate(page);
-                status = copied;
-        }
-        unlock_page(page);
-        page_cache_release(page);
-        return status;
-}
-const struct address_space_operations smb_file_aops = {
-        .readpage = smb_readpage,
-        .writepage = smb_writepage,
-        .write_begin = smb_write_begin,
-        .write_end = smb_write_end,
-};
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t pos)
-{
-        struct file * file = iocb->ki_filp;
-        struct dentry * dentry = file->f_path.dentry;
-        ssize_t result;
-        VERBOSE("file %s/%s, count=%lu@%lu\n",
-                DENTRY_PATH(dentry),
-                (unsigned long) iocb->ki_left, (unsigned long) pos);
-        result = smb_revalidate_inode(dentry);
-        if (result) {
-                PARANOIA("%s/%s validation failed, error=%Zd\n",
-                         DENTRY_PATH(dentry), result);
-                goto out;
-        }
-        result = smb_open(dentry, SMB_O_WRONLY);
-        if (result)
-                goto out;
-        if (iocb->ki_left > 0) {
-                result = generic_file_aio_write(iocb, iov, nr_segs, pos);
-                VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
-                        (long) file->f_pos, (long) dentry->d_inode->i_size,
-                        dentry->d_inode->i_mtime.tv_sec,
-                        dentry->d_inode->i_atime.tv_sec);
-        }
-out:
-        return result;
-}
-static int
-smb_file_open(struct inode *inode, struct file * file)
-{
-        int result;
-        struct dentry *dentry = file->f_path.dentry;
-        int smb_mode = (file->f_mode & O_ACCMODE) - 1;
-        lock_kernel();
-        result = smb_open(dentry, smb_mode);
-        if (result)
-                goto out;
-        SMB_I(inode)->openers++;
-out:
-        unlock_kernel();
-        return result;
-}
-static int
-smb_file_release(struct inode *inode, struct file * file)
-{
-        lock_kernel();
-        if (!--SMB_I(inode)->openers) {
-                /* We must flush any dirty pages now as we won't be able to
-                   write anything after close. mmap can trigger this.
-                   "openers" should perhaps include mmap'ers ... */
-                filemap_write_and_wait(inode->i_mapping);
-                smb_close(inode);
-        }
-        unlock_kernel();
-        return 0;
-}
-/*
- * Check whether the required access is compatible with
- * an inode's permission. SMB doesn't recognize superuser
- * privileges, so we need our own check for this.
- */
-static int
-smb_file_permission(struct inode *inode, int mask)
-{
-        int mode = inode->i_mode;
-        int error = 0;
-        VERBOSE("mode=%x, mask=%x\n", mode, mask);
-        /* Look at user permissions */
-        mode >>= 6;
-        if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
-                error = -EACCES;
-        return error;
-}
-static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
-{
-        loff_t ret;
-        lock_kernel();
-        ret = generic_file_llseek_unlocked(file, offset, origin);
-        unlock_kernel();
-        return ret;
-}
-const struct file_operations smb_file_operations =
-{
-        .llseek         = smb_remote_llseek,
-        .read           = do_sync_read,
-        .aio_read       = smb_file_aio_read,
-        .write          = do_sync_write,
-        .aio_write      = smb_file_aio_write,
-        .unlocked_ioctl = smb_ioctl,
-        .mmap           = smb_file_mmap,
-        .open           = smb_file_open,
-        .release        = smb_file_release,
-        .fsync          = smb_fsync,
-        .splice_read    = smb_file_splice_read,
-};
-const struct inode_operations smb_file_inode_operations =
-{
-        .permission     = smb_file_permission,
-        .getattr        = smb_getattr,
-        .setattr        = smb_notify_change,
-};
diff --git a/fs/smbfs/getopt.c b/fs/smbfs/getopt.c
deleted file mode 100644
index 7ae0f5273ab1..000000000000
--- a/fs/smbfs/getopt.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * getopt.c
- */
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/net.h>
-#include "getopt.h"
-/**
- *      smb_getopt - option parser
- *      @caller: name of the caller, for error messages
- *      @options: the options string
- *      @opts: an array of &struct option entries controlling parser operations
- *      @optopt: output; will contain the current option
- *      @optarg: output; will contain the value (if one exists)
- *      @flag: output; may be NULL; should point to a long for or'ing flags
- *      @value: output; may be NULL; will be overwritten with the integer value
- *              of the current argument.
- *
- *      Helper to parse options on the format used by mount ("a=b,c=d,e,f").
- *      Returns opts->val if a matching entry in the 'opts' array is found,
- *      0 when no more tokens are found, -1 if an error is encountered.
- */
-int smb_getopt(char *caller, char **options, struct option *opts,
-               char **optopt, char **optarg, unsigned long *flag,
-               unsigned long *value)
-{
-        char *token;
-        char *val;
-        int i;
-        do {
-                if ((token = strsep(options, ",")) == NULL)
-                        return 0;
-        } while (*token == '\0');
-        *optopt = token;
-        *optarg = NULL;
-        if ((val = strchr (token, '=')) != NULL) {
-                *val++ = 0;
-                if (value)
-                        *value = simple_strtoul(val, NULL, 0);
-                *optarg = val;
-        }
-        for (i = 0; opts[i].name != NULL; i++) {
-                if (!strcmp(opts[i].name, token)) {
-                        if (!opts[i].flag && (!val || !*val)) {
-                                printk("%s: the %s option requires an argument\n",
-                                       caller, token);
-                                return -1;
-                        }
-                        if (flag && opts[i].flag)
-                                *flag |= opts[i].flag;
-                        return opts[i].val;
-                }
-        }
-        printk("%s: Unrecognized mount option %s\n", caller, token);
-        return -1;
-}
diff --git a/fs/smbfs/getopt.h b/fs/smbfs/getopt.h
deleted file mode 100644
index 146219ac7c46..000000000000
--- a/fs/smbfs/getopt.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _LINUX_GETOPT_H
-#define _LINUX_GETOPT_H
-struct option {
-        const char *name;
-        unsigned long flag;
-        int val;
-};
-extern int smb_getopt(char *caller, char **options, struct option *opts,
-                      char **optopt, char **optarg, unsigned long *flag,
-                      unsigned long *value);
-#endif /* _LINUX_GETOPT_H */
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
deleted file mode 100644
index 450c91941988..000000000000
--- a/fs/smbfs/inode.c
+++ /dev/null
@@ -1,839 +0,0 @@
-/*
- *  inode.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-#include <linux/nls.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/highuid.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include "smb_debug.h"
-#include "getopt.h"
-#include "proto.h"
-/* Always pick a default string */
-#ifdef CONFIG_SMB_NLS_REMOTE
-#define SMB_NLS_REMOTE CONFIG_SMB_NLS_REMOTE
-#else
-#define SMB_NLS_REMOTE ""
-#endif
-#define SMB_TTL_DEFAULT 1000
-static void smb_evict_inode(struct inode *);
-static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct dentry *, struct kstatfs *);
-static int  smb_show_options(struct seq_file *, struct vfsmount *);
-static struct kmem_cache *smb_inode_cachep;
-static struct inode *smb_alloc_inode(struct super_block *sb)
-{
-        struct smb_inode_info *ei;
-        ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL);
-        if (!ei)
-                return NULL;
-        return &ei->vfs_inode;
-}
-static void smb_destroy_inode(struct inode *inode)
-{
-        kmem_cache_free(smb_inode_cachep, SMB_I(inode));
-}
-static void init_once(void *foo)
-{
-        struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-        inode_init_once(&ei->vfs_inode);
-}
-static int init_inodecache(void)
-{
-        smb_inode_cachep = kmem_cache_create("smb_inode_cache",
-                                             sizeof(struct smb_inode_info),
-                                             0, (SLAB_RECLAIM_ACCOUNT|
-                                                SLAB_MEM_SPREAD),
-                                             init_once);
-        if (smb_inode_cachep == NULL)
-                return -ENOMEM;
-        return 0;
-}
-static void destroy_inodecache(void)
-{
-        kmem_cache_destroy(smb_inode_cachep);
-}
-static int smb_remount(struct super_block *sb, int *flags, char *data)
-{
-        *flags |= MS_NODIRATIME;
-        return 0;
-}
-static const struct super_operations smb_sops =
-{
-        .alloc_inode    = smb_alloc_inode,
-        .destroy_inode  = smb_destroy_inode,
-        .drop_inode     = generic_delete_inode,
-        .evict_inode    = smb_evict_inode,
-        .put_super      = smb_put_super,
-        .statfs         = smb_statfs,
-        .show_options   = smb_show_options,
-        .remount_fs     = smb_remount,
-};
-/* We are always generating a new inode here */
-struct inode *
-smb_iget(struct super_block *sb, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = SMB_SB(sb);
-        struct inode *result;
-        DEBUG1("smb_iget: %p\n", fattr);
-        result = new_inode(sb);
-        if (!result)
-                return result;
-        result->i_ino = fattr->f_ino;
-        SMB_I(result)->open = 0;
-        SMB_I(result)->fileid = 0;
-        SMB_I(result)->access = 0;
-        SMB_I(result)->flags = 0;
-        SMB_I(result)->closed = 0;
-        SMB_I(result)->openers = 0;
-        smb_set_inode_attr(result, fattr);
-        if (S_ISREG(result->i_mode)) {
-                result->i_op = &smb_file_inode_operations;
-                result->i_fop = &smb_file_operations;
-                result->i_data.a_ops = &smb_file_aops;
-        } else if (S_ISDIR(result->i_mode)) {
-                if (server->opt.capabilities & SMB_CAP_UNIX)
-                        result->i_op = &smb_dir_inode_operations_unix;
-                else
-                        result->i_op = &smb_dir_inode_operations;
-                result->i_fop = &smb_dir_operations;
-        } else if (S_ISLNK(result->i_mode)) {
-                result->i_op = &smb_link_inode_operations;
-        } else {
-                init_special_inode(result, result->i_mode, fattr->f_rdev);
-        }
-        insert_inode_hash(result);
-        return result;
-}
-/*
- * Copy the inode data to a smb_fattr structure.
- */
-void
-smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-        memset(fattr, 0, sizeof(struct smb_fattr));
-        fattr->f_mode   = inode->i_mode;
-        fattr->f_nlink  = inode->i_nlink;
-        fattr->f_ino    = inode->i_ino;
-        fattr->f_uid    = inode->i_uid;
-        fattr->f_gid    = inode->i_gid;
-        fattr->f_size   = inode->i_size;
-        fattr->f_mtime  = inode->i_mtime;
-        fattr->f_ctime  = inode->i_ctime;
-        fattr->f_atime  = inode->i_atime;
-        fattr->f_blocks = inode->i_blocks;
-        fattr->attr     = SMB_I(inode)->attr;
-        /*
-         * Keep the attributes in sync with the inode permissions.
-         */
-        if (fattr->f_mode & S_IWUSR)
-                fattr->attr &= ~aRONLY;
-        else
-                fattr->attr |= aRONLY;
-}
-/*
- * Update the inode, possibly causing it to invalidate its pages if mtime/size
- * is different from last time.
- */
-void
-smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
-{
-        struct smb_inode_info *ei = SMB_I(inode);
-        /*
-         * A size change should have a different mtime, or same mtime
-         * but different size.
-         */
-        time_t last_time = inode->i_mtime.tv_sec;
-        loff_t last_sz = inode->i_size;
-        inode->i_mode   = fattr->f_mode;
-        inode->i_nlink  = fattr->f_nlink;
-        inode->i_uid    = fattr->f_uid;
-        inode->i_gid    = fattr->f_gid;
-        inode->i_ctime  = fattr->f_ctime;
-        inode->i_blocks = fattr->f_blocks;
-        inode->i_size   = fattr->f_size;
-        inode->i_mtime  = fattr->f_mtime;
-        inode->i_atime  = fattr->f_atime;
-        ei->attr = fattr->attr;
-        /*
-         * Update the "last time refreshed" field for revalidation.
-         */
-        ei->oldmtime = jiffies;
-        if (inode->i_mtime.tv_sec != last_time || inode->i_size != last_sz) {
-                VERBOSE("%ld changed, old=%ld, new=%ld, oz=%ld, nz=%ld\n",
-                        inode->i_ino,
-                        (long) last_time, (long) inode->i_mtime.tv_sec,
-                        (long) last_sz, (long) inode->i_size);
-                if (!S_ISDIR(inode->i_mode))
-                        invalidate_remote_inode(inode);
-        }
-}
-/*
- * This is called if the connection has gone bad ...
- * try to kill off all the current inodes.
- */
-void
-smb_invalidate_inodes(struct smb_sb_info *server)
-{
-        VERBOSE("\n");
-        shrink_dcache_sb(SB_of(server));
-        invalidate_inodes(SB_of(server));
-}
-/*
- * This is called to update the inode attributes after
- * we've made changes to a file or directory.
- */
-static int
-smb_refresh_inode(struct dentry *dentry)
-{
-        struct inode *inode = dentry->d_inode;
-        int error;
-        struct smb_fattr fattr;
-        error = smb_proc_getattr(dentry, &fattr);
-        if (!error) {
-                smb_renew_times(dentry);
-                /*
-                 * Check whether the type part of the mode changed,
-                 * and don't update the attributes if it did.
-                 *
-                 * And don't dick with the root inode
-                 */
-                if (inode->i_ino == 2)
-                        return error;
-                if (S_ISLNK(inode->i_mode))
-                        return error;   /* VFS will deal with it */
-                if ((inode->i_mode & S_IFMT) == (fattr.f_mode & S_IFMT)) {
-                        smb_set_inode_attr(inode, &fattr);
-                } else {
-                        /*
-                         * Big trouble! The inode has become a new object,
-                         * so any operations attempted on it are invalid.
-                         *
-                         * To limit damage, mark the inode as bad so that
-                         * subsequent lookup validations will fail.
-                         */
-                        PARANOIA("%s/%s changed mode, %07o to %07o\n",
-                                 DENTRY_PATH(dentry),
-                                 inode->i_mode, fattr.f_mode);
-                        fattr.f_mode = inode->i_mode; /* save mode */
-                        make_bad_inode(inode);
-                        inode->i_mode = fattr.f_mode; /* restore mode */
-                        /*
-                         * No need to worry about unhashing the dentry: the
-                         * lookup validation will see that the inode is bad.
-                         * But we do want to invalidate the caches ...
-                         */
-                        if (!S_ISDIR(inode->i_mode))
-                                invalidate_remote_inode(inode);
-                        else
-                                smb_invalid_dir_cache(inode);
-                        error = -EIO;
-                }
-        }
-        return error;
-}
-/*
- * This is called when we want to check whether the inode
- * has changed on the server.  If it has changed, we must
- * invalidate our local caches.
- */
-int
-smb_revalidate_inode(struct dentry *dentry)
-{
-        struct smb_sb_info *s = server_from_dentry(dentry);
-        struct inode *inode = dentry->d_inode;
-        int error = 0;
-        DEBUG1("smb_revalidate_inode\n");
-        lock_kernel();
-        /*
-         * Check whether we've recently refreshed the inode.
-         */
-        if (time_before(jiffies, SMB_I(inode)->oldmtime + SMB_MAX_AGE(s))) {
-                VERBOSE("up-to-date, ino=%ld, jiffies=%lu, oldtime=%lu\n",
-                        inode->i_ino, jiffies, SMB_I(inode)->oldmtime);
-                goto out;
-        }
-        error = smb_refresh_inode(dentry);
-out:
-        unlock_kernel();
-        return error;
-}
-/*
- * This routine is called when i_nlink == 0 and i_count goes to 0.
- * All blocking cleanup operations need to go here to avoid races.
- */
-static void
-smb_evict_inode(struct inode *ino)
-{
-        DEBUG1("ino=%ld\n", ino->i_ino);
-        truncate_inode_pages(&ino->i_data, 0);
-        end_writeback(ino);
-        lock_kernel();
-        if (smb_close(ino))
-                PARANOIA("could not close inode %ld\n", ino->i_ino);
-        unlock_kernel();
-}
-static struct option opts[] = {
-        { "version",    0, 'v' },
-        { "win95",      SMB_MOUNT_WIN95, 1 },
-        { "oldattr",    SMB_MOUNT_OLDATTR, 1 },
-        { "dirattr",    SMB_MOUNT_DIRATTR, 1 },
-        { "case",       SMB_MOUNT_CASE, 1 },
-        { "uid",        0, 'u' },
-        { "gid",        0, 'g' },
-        { "file_mode",  0, 'f' },
-        { "dir_mode",   0, 'd' },
-        { "iocharset",  0, 'i' },
-        { "codepage",   0, 'c' },
-        { "ttl",        0, 't' },
-        { NULL,         0, 0}
-};
-static int
-parse_options(struct smb_mount_data_kernel *mnt, char *options)
-{
-        int c;
-        unsigned long flags;
-        unsigned long value;
-        char *optarg;
-        char *optopt;
-        flags = 0;
-        while ( (c = smb_getopt("smbfs", &options, opts,
-                                &optopt, &optarg, &flags, &value)) > 0) {
-                VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
-                switch (c) {
-                case 1:
-                        /* got a "flag" option */
-                        break;
-                case 'v':
-                        if (value != SMB_MOUNT_VERSION) {
-                        printk ("smbfs: Bad mount version %ld, expected %d\n",
-                                value, SMB_MOUNT_VERSION);
-                                return 0;
-                        }
-                        mnt->version = value;
-                        break;
-                case 'u':
-                        mnt->uid = value;
-                        flags |= SMB_MOUNT_UID;
-                        break;
-                case 'g':
-                        mnt->gid = value;
-                        flags |= SMB_MOUNT_GID;
-                        break;
-                case 'f':
-                        mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
-                        flags |= SMB_MOUNT_FMODE;
-                        break;
-                case 'd':
-                        mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
-                        flags |= SMB_MOUNT_DMODE;
-                        break;
-                case 'i':
-                        strlcpy(mnt->codepage.local_name, optarg, 
-                                SMB_NLS_MAXNAMELEN);
-                        break;
-                case 'c':
-                        strlcpy(mnt->codepage.remote_name, optarg,
-                                SMB_NLS_MAXNAMELEN);
-                        break;
-                case 't':
-                        mnt->ttl = value;
-                        break;
-                default:
-                        printk ("smbfs: Unrecognized mount option %s\n",
-                                optopt);
-                        return -1;
-                }
-        }
-        mnt->flags = flags;
-        return c;
-}
-/*
- * smb_show_options() is for displaying mount options in /proc/mounts.
- * It tries to avoid showing settings that were not changed from their
- * defaults.
- */
-static int
-smb_show_options(struct seq_file *s, struct vfsmount *m)
-{
-        struct smb_mount_data_kernel *mnt = SMB_SB(m->mnt_sb)->mnt;
-        int i;
-        for (i = 0; opts[i].name != NULL; i++)
-                if (mnt->flags & opts[i].flag)
-                        seq_printf(s, ",%s", opts[i].name);
-        if (mnt->flags & SMB_MOUNT_UID)
-                seq_printf(s, ",uid=%d", mnt->uid);
-        if (mnt->flags & SMB_MOUNT_GID)
-                seq_printf(s, ",gid=%d", mnt->gid);
-        if (mnt->mounted_uid != 0)
-                seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
-        /* 
-         * Defaults for file_mode and dir_mode are unknown to us; they
-         * depend on the current umask of the user doing the mount.
-         */
-        if (mnt->flags & SMB_MOUNT_FMODE)
-                seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
-        if (mnt->flags & SMB_MOUNT_DMODE)
-                seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
-        if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
-                seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
-        if (strcmp(mnt->codepage.remote_name, SMB_NLS_REMOTE))
-                seq_printf(s, ",codepage=%s", mnt->codepage.remote_name);
-        if (mnt->ttl != SMB_TTL_DEFAULT)
-                seq_printf(s, ",ttl=%d", mnt->ttl);
-        return 0;
-}
-static void
-smb_unload_nls(struct smb_sb_info *server)
-{
-        unload_nls(server->remote_nls);
-        unload_nls(server->local_nls);
-}
-static void
-smb_put_super(struct super_block *sb)
-{
-        struct smb_sb_info *server = SMB_SB(sb);
-        lock_kernel();
-        smb_lock_server(server);
-        server->state = CONN_INVALID;
-        smbiod_unregister_server(server);
-        smb_close_socket(server);
-        if (server->conn_pid)
-                kill_pid(server->conn_pid, SIGTERM, 1);
-        bdi_destroy(&server->bdi);
-        kfree(server->ops);
-        smb_unload_nls(server);
-        sb->s_fs_info = NULL;
-        smb_unlock_server(server);
-        put_pid(server->conn_pid);
-        kfree(server);
-        unlock_kernel();
-}
-static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-        struct smb_sb_info *server;
-        struct smb_mount_data_kernel *mnt;
-        struct smb_mount_data *oldmnt;
-        struct inode *root_inode;
-        struct smb_fattr root;
-        int ver;
-        void *mem;
-        static int warn_count;
-        if (warn_count < 5) {
-                warn_count++;
-                printk(KERN_EMERG "smbfs is deprecated and will be removed"
-                        " from the 2.6.27 kernel. Please migrate to cifs\n");
-        }
-        if (!raw_data)
-                goto out_no_data;
-        oldmnt = (struct smb_mount_data *) raw_data;
-        ver = oldmnt->version;
-        if (ver != SMB_MOUNT_OLDVERSION && cpu_to_be32(ver) != SMB_MOUNT_ASCII)
-                goto out_wrong_data;
-        sb->s_flags |= MS_NODIRATIME;
-        sb->s_blocksize = 1024; /* Eh...  Is this correct? */
-        sb->s_blocksize_bits = 10;
-        sb->s_magic = SMB_SUPER_MAGIC;
-        sb->s_op = &smb_sops;
-        sb->s_time_gran = 100;
-        server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
-        if (!server)
-                goto out_no_server;
-        sb->s_fs_info = server;
-        
-        if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
-                goto out_bdi;
-        sb->s_bdi = &server->bdi;
-        server->super_block = sb;
-        server->mnt = NULL;
-        server->sock_file = NULL;
-        init_waitqueue_head(&server->conn_wq);
-        init_MUTEX(&server->sem);
-        INIT_LIST_HEAD(&server->entry);
-        INIT_LIST_HEAD(&server->xmitq);
-        INIT_LIST_HEAD(&server->recvq);
-        server->conn_error = 0;
-        server->conn_pid = NULL;
-        server->state = CONN_INVALID; /* no connection yet */
-        server->generation = 0;
-        /* Allocate the global temp buffer and some superblock helper structs */
-        /* FIXME: move these to the smb_sb_info struct */
-        VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
-                sizeof(struct smb_mount_data_kernel));
-        mem = kmalloc(sizeof(struct smb_ops) +
-                      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
-        if (!mem)
-                goto out_no_mem;
-        server->ops = mem;
-        smb_install_null_ops(server->ops);
-        server->mnt = mem + sizeof(struct smb_ops);
-        /* Setup NLS stuff */
-        server->remote_nls = NULL;
-        server->local_nls = NULL;
-        mnt = server->mnt;
-        memset(mnt, 0, sizeof(struct smb_mount_data_kernel));
-        strlcpy(mnt->codepage.local_name, CONFIG_NLS_DEFAULT,
-                SMB_NLS_MAXNAMELEN);
-        strlcpy(mnt->codepage.remote_name, SMB_NLS_REMOTE,
-                SMB_NLS_MAXNAMELEN);
-        mnt->ttl = SMB_TTL_DEFAULT;
-        if (ver == SMB_MOUNT_OLDVERSION) {
-                mnt->version = oldmnt->version;
-                SET_UID(mnt->uid, oldmnt->uid);
-                SET_GID(mnt->gid, oldmnt->gid);
-                mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
-                mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-                mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
-                        SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
-        } else {
-                mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-                                S_IROTH | S_IXOTH | S_IFREG;
-                mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
-                                S_IROTH | S_IXOTH | S_IFDIR;
-                if (parse_options(mnt, raw_data))
-                        goto out_bad_option;
-        }
-        mnt->mounted_uid = current_uid();
-        smb_setcodepage(server, &mnt->codepage);
-        /*
-         * Display the enabled options
-         * Note: smb_proc_getattr uses these in 2.4 (but was changed in 2.2)
-         */
-        if (mnt->flags & SMB_MOUNT_OLDATTR)
-                printk("SMBFS: Using core getattr (Win 95 speedup)\n");
-        else if (mnt->flags & SMB_MOUNT_DIRATTR)
-                printk("SMBFS: Using dir ff getattr\n");
-        if (smbiod_register_server(server) < 0) {
-                printk(KERN_ERR "smbfs: failed to start smbiod\n");
-                goto out_no_smbiod;
-        }
-        /*
-         * Keep the super block locked while we get the root inode.
-         */
-        smb_init_root_dirent(server, &root, sb);
-        root_inode = smb_iget(sb, &root);
-        if (!root_inode)
-                goto out_no_root;
-        sb->s_root = d_alloc_root(root_inode);
-        if (!sb->s_root)
-                goto out_no_root;
-        smb_new_dentry(sb->s_root);
-        return 0;
-out_no_root:
-        iput(root_inode);
-out_no_smbiod:
-        smb_unload_nls(server);
-out_bad_option:
-        kfree(mem);
-out_no_mem:
-        bdi_destroy(&server->bdi);
-out_bdi:
-        if (!server->mnt)
-                printk(KERN_ERR "smb_fill_super: allocation failure\n");
-        sb->s_fs_info = NULL;
-        kfree(server);
-        goto out_fail;
-out_wrong_data:
-        printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
-        goto out_fail;
-out_no_data:
-        printk(KERN_ERR "smb_fill_super: missing data argument\n");
-out_fail:
-        return -EINVAL;
-out_no_server:
-        printk(KERN_ERR "smb_fill_super: cannot allocate struct smb_sb_info\n");
-        return -ENOMEM;
-}
-static int
-smb_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-        int result;
-        
-        lock_kernel();
-        result = smb_proc_dskattr(dentry, buf);
-        unlock_kernel();
-        buf->f_type = SMB_SUPER_MAGIC;
-        buf->f_namelen = SMB_MAXPATHLEN;
-        return result;
-}
-int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
-{
-        int err = smb_revalidate_inode(dentry);
-        if (!err)
-                generic_fillattr(dentry->d_inode, stat);
-        return err;
-}
-int
-smb_notify_change(struct dentry *dentry, struct iattr *attr)
-{
-        struct inode *inode = dentry->d_inode;
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        unsigned int mask = (S_IFREG | S_IFDIR | S_IRWXUGO);
-        int error, changed, refresh = 0;
-        struct smb_fattr fattr;
-        lock_kernel();
-        error = smb_revalidate_inode(dentry);
-        if (error)
-                goto out;
-        if ((error = inode_change_ok(inode, attr)) < 0)
-                goto out;
-        error = -EPERM;
-        if ((attr->ia_valid & ATTR_UID) && (attr->ia_uid != server->mnt->uid))
-                goto out;
-        if ((attr->ia_valid & ATTR_GID) && (attr->ia_uid != server->mnt->gid))
-                goto out;
-        if ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~mask))
-                goto out;
-        if ((attr->ia_valid & ATTR_SIZE) != 0) {
-                VERBOSE("changing %s/%s, old size=%ld, new size=%ld\n",
-                        DENTRY_PATH(dentry),
-                        (long) inode->i_size, (long) attr->ia_size);
-                filemap_write_and_wait(inode->i_mapping);
-                error = smb_open(dentry, O_WRONLY);
-                if (error)
-                        goto out;
-                error = server->ops->truncate(inode, attr->ia_size);
-                if (error)
-                        goto out;
-                truncate_setsize(inode, attr->ia_size);
-                refresh = 1;
-        }
-        if (server->opt.capabilities & SMB_CAP_UNIX) {
-                /* For now we don't want to set the size with setattr_unix */
-                attr->ia_valid &= ~ATTR_SIZE;
-                /* FIXME: only call if we actually want to set something? */
-                error = smb_proc_setattr_unix(dentry, attr, 0, 0);
-                if (!error)
-                        refresh = 1;
-                goto out;
-        }
-        /*
-         * Initialize the fattr and check for changed fields.
-         * Note: CTIME under SMB is creation time rather than
-         * change time, so we don't attempt to change it.
-         */
-        smb_get_inode_attr(inode, &fattr);
-        changed = 0;
-        if ((attr->ia_valid & ATTR_MTIME) != 0) {
-                fattr.f_mtime = attr->ia_mtime;
-                changed = 1;
-        }
-        if ((attr->ia_valid & ATTR_ATIME) != 0) {
-                fattr.f_atime = attr->ia_atime;
-                /* Earlier protocols don't have an access time */
-                if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2)
-                        changed = 1;
-        }
-        if (changed) {
-                error = smb_proc_settime(dentry, &fattr);
-                if (error)
-                        goto out;
-                refresh = 1;
-        }
-        /*
-         * Check for mode changes ... we're extremely limited in
-         * what can be set for SMB servers: just the read-only bit.
-         */
-        if ((attr->ia_valid & ATTR_MODE) != 0) {
-                VERBOSE("%s/%s mode change, old=%x, new=%x\n",
-                        DENTRY_PATH(dentry), fattr.f_mode, attr->ia_mode);
-                changed = 0;
-                if (attr->ia_mode & S_IWUSR) {
-                        if (fattr.attr & aRONLY) {
-                                fattr.attr &= ~aRONLY;
-                                changed = 1;
-                        }
-                } else {
-                        if (!(fattr.attr & aRONLY)) {
-                                fattr.attr |= aRONLY;
-                                changed = 1;
-                        }
-                }
-                if (changed) {
-                        error = smb_proc_setattr(dentry, &fattr);
-                        if (error)
-                                goto out;
-                        refresh = 1;
-                }
-        }
-        error = 0;
-out:
-        if (refresh)
-                smb_refresh_inode(dentry);
-        unlock_kernel();
-        return error;
-}
-static int smb_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
-}
-static struct file_system_type smb_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "smbfs",
-        .get_sb         = smb_get_sb,
-        .kill_sb        = kill_anon_super,
-        .fs_flags       = FS_BINARY_MOUNTDATA,
-};
-static int __init init_smb_fs(void)
-{
-        int err;
-        DEBUG1("registering ...\n");
-        err = init_inodecache();
-        if (err)
-                goto out_inode;
-        err = smb_init_request_cache();
-        if (err)
-                goto out_request;
-        err = register_filesystem(&smb_fs_type);
-        if (err)
-                goto out;
-        return 0;
-out:
-        smb_destroy_request_cache();
-out_request:
-        destroy_inodecache();
-out_inode:
-        return err;
-}
-static void __exit exit_smb_fs(void)
-{
-        DEBUG1("unregistering ...\n");
-        unregister_filesystem(&smb_fs_type);
-        smb_destroy_request_cache();
-        destroy_inodecache();
-}
-module_init(init_smb_fs)
-module_exit(exit_smb_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
deleted file mode 100644
index 07215312ad39..000000000000
--- a/fs/smbfs/ioctl.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  ioctl.c
- *
- *  Copyright (C) 1995, 1996 by Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/time.h>
-#include <linux/mm.h>
-#include <linux/highuid.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/smb_fs.h>
-#include <linux/smb_mount.h>
-#include <asm/uaccess.h>
-#include "proto.h"
-long
-smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-        struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
-        struct smb_conn_opt opt;
-        int result = -EINVAL;
-        lock_kernel();
-        switch (cmd) {
-                uid16_t uid16;
-                uid_t uid32;
-        case SMB_IOC_GETMOUNTUID:
-                SET_UID(uid16, server->mnt->mounted_uid);
-                result = put_user(uid16, (uid16_t __user *) arg);
-                break;
-        case SMB_IOC_GETMOUNTUID32:
-                SET_UID(uid32, server->mnt->mounted_uid);
-                result = put_user(uid32, (uid_t __user *) arg);
-                break;
-        case SMB_IOC_NEWCONN:
-                /* arg is smb_conn_opt, or NULL if no connection was made */
-                if (!arg) {
-                        result = 0;
-                        smb_lock_server(server);
-                        server->state = CONN_RETRIED;
-                        printk(KERN_ERR "Connection attempt failed!  [%d]\n",
-                               server->conn_error);
-                        smbiod_flush(server);
-                        smb_unlock_server(server);
-                        break;
-                }
-                result = -EFAULT;
-                if (!copy_from_user(&opt, (void __user *)arg, sizeof(opt)))
-                        result = smb_newconn(server, &opt);
-                break;
-        default:
-                break;
-        }
-        unlock_kernel();
-        return result;
-}
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
deleted file mode 100644
index 71c29b6670b4..000000000000
--- a/fs/smbfs/proc.c
+++ /dev/null
@@ -1,3507 +0,0 @@
-/*
- *  proc.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/dcache.h>
-#include <linux/nls.h>
-#include <linux/smp_lock.h>
-#include <linux/net.h>
-#include <linux/vfs.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <net/sock.h>
-#include <asm/string.h>
-#include <asm/div64.h>
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-/* Features. Undefine if they cause problems, this should perhaps be a
-   config option. */
-#define SMBFS_POSIX_UNLINK 1
-/* Allow smb_retry to be interrupted. */
-#define SMB_RETRY_INTR
-#define SMB_VWV(packet)  ((packet) + SMB_HEADER_LEN)
-#define SMB_CMD(packet)  (*(packet+8))
-#define SMB_WCT(packet)  (*(packet+SMB_HEADER_LEN - 1))
-#define SMB_DIRINFO_SIZE 43
-#define SMB_STATUS_SIZE  21
-#define SMB_ST_BLKSIZE  (PAGE_SIZE)
-#define SMB_ST_BLKSHIFT (PAGE_SHIFT)
-static struct smb_ops smb_ops_core;
-static struct smb_ops smb_ops_os2;
-static struct smb_ops smb_ops_win95;
-static struct smb_ops smb_ops_winNT;
-static struct smb_ops smb_ops_unix;
-static struct smb_ops smb_ops_null;
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *fattr);
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-                    struct smb_fattr *fattr);
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-                      u16 attr);
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-                     struct inode *inode, struct smb_fattr *fattr);
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server);
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src);
-static void
-str_upper(char *name, int len)
-{
-        while (len--)
-        {
-                if (*name >= 'a' && *name <= 'z')
-                        *name -= ('a' - 'A');
-                name++;
-        }
-}
-#if 0
-static void
-str_lower(char *name, int len)
-{
-        while (len--)
-        {
-                if (*name >= 'A' && *name <= 'Z')
-                        *name += ('a' - 'A');
-                name++;
-        }
-}
-#endif
-/* reverse a string inline. This is used by the dircache walking routines */
-static void reverse_string(char *buf, int len)
-{
-        char c;
-        char *end = buf+len-1;
-        while(buf < end) {
-                c = *buf;
-                *(buf++) = *end;
-                *(end--) = c;
-        }
-}
-/* no conversion, just a wrapper for memcpy. */
-static int convert_memcpy(unsigned char *output, int olen,
-                          const unsigned char *input, int ilen,
-                          struct nls_table *nls_from,
-                          struct nls_table *nls_to)
-{
-        if (olen < ilen)
-                return -ENAMETOOLONG;
-        memcpy(output, input, ilen);
-        return ilen;
-}
-static inline int write_char(unsigned char ch, char *output, int olen)
-{
-        if (olen < 4)
-                return -ENAMETOOLONG;
-        sprintf(output, ":x%02x", ch);
-        return 4;
-}
-static inline int write_unichar(wchar_t ch, char *output, int olen)
-{
-        if (olen < 5)
-                return -ENAMETOOLONG;
-        sprintf(output, ":%04x", ch);
-        return 5;
-}
-/* convert from one "codepage" to another (possibly being utf8). */
-static int convert_cp(unsigned char *output, int olen,
-                      const unsigned char *input, int ilen,
-                      struct nls_table *nls_from,
-                      struct nls_table *nls_to)
-{
-        int len = 0;
-        int n;
-        wchar_t ch;
-        while (ilen > 0) {
-                /* convert by changing to unicode and back to the new cp */
-                n = nls_from->char2uni(input, ilen, &ch);
-                if (n == -EINVAL) {
-                        ilen--;
-                        n = write_char(*input++, output, olen);
-                        if (n < 0)
-                                goto fail;
-                        output += n;
-                        olen -= n;
-                        len += n;
-                        continue;
-                } else if (n < 0)
-                        goto fail;
-                input += n;
-                ilen -= n;
-                n = nls_to->uni2char(ch, output, olen);
-                if (n == -EINVAL)
-                        n = write_unichar(ch, output, olen);
-                if (n < 0)
-                        goto fail;
-                output += n;
-                olen -= n;
-                len += n;
-        }
-        return len;
-fail:
-        return n;
-}
-/* ----------------------------------------------------------- */
-/*
- * nls_unicode
- *
- * This encodes/decodes little endian unicode format
- */
-static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
-{
-        if (boundlen < 2)
-                return -EINVAL;
-        *out++ = uni & 0xff;
-        *out++ = uni >> 8;
-        return 2;
-}
-static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
-{
-        if (boundlen < 2)
-                return -EINVAL;
-        *uni = (rawstring[1] << 8) | rawstring[0];
-        return 2;
-}
-static struct nls_table unicode_table = {
-        .charset        = "unicode",
-        .uni2char       = uni2char,
-        .char2uni       = char2uni,
-};
-/* ----------------------------------------------------------- */
-static int setcodepage(struct nls_table **p, char *name)
-{
-        struct nls_table *nls;
-        if (!name || !*name) {
-                nls = NULL;
-        } else if ( (nls = load_nls(name)) == NULL) {
-                printk (KERN_ERR "smbfs: failed to load nls '%s'\n", name);
-                return -EINVAL;
-        }
-        /* if already set, unload the previous one. */
-        if (*p && *p != &unicode_table)
-                unload_nls(*p);
-        *p = nls;
-        return 0;
-}
-/* Handles all changes to codepage settings. */
-int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp)
-{
-        int n = 0;
-        smb_lock_server(server);
-        /* Don't load any nls_* at all, if no remote is requested */
-        if (!*cp->remote_name)
-                goto out;
-        /* local */
-        n = setcodepage(&server->local_nls, cp->local_name);
-        if (n != 0)
-                goto out;
-        /* remote */
-        if (!strcmp(cp->remote_name, "unicode")) {
-                server->remote_nls = &unicode_table;
-        } else {
-                n = setcodepage(&server->remote_nls, cp->remote_name);
-                if (n != 0)
-                        setcodepage(&server->local_nls, NULL);
-        }
-out:
-        if (server->local_nls != NULL && server->remote_nls != NULL)
-                server->ops->convert = convert_cp;
-        else
-                server->ops->convert = convert_memcpy;
-        smb_unlock_server(server);
-        return n;
-}
-/*****************************************************************************/
-/*                                                                           */
-/*  Encoding/Decoding section                                                */
-/*                                                                           */
-/*****************************************************************************/
-static __u8 *
-smb_encode_smb_length(__u8 * p, __u32 len)
-{
-        *p = 0;
-        *(p+1) = 0;
-        *(p+2) = (len & 0xFF00) >> 8;
-        *(p+3) = (len & 0xFF);
-        if (len > 0xFFFF)
-        {
-                *(p+1) = 1;
-        }
-        return p + 4;
-}
-/*
- * smb_build_path: build the path to entry and name storing it in buf.
- * The path returned will have the trailing '\0'.
- */
-static int smb_build_path(struct smb_sb_info *server, unsigned char *buf,
-                          int maxlen,
-                          struct dentry *entry, struct qstr *name)
-{
-        unsigned char *path = buf;
-        int len;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE) != 0;
-        if (maxlen < (2<<unicode))
-                return -ENAMETOOLONG;
-        if (maxlen > SMB_MAXPATHLEN + 1)
-                maxlen = SMB_MAXPATHLEN + 1;
-        if (entry == NULL)
-                goto test_name_and_out;
-        /*
-         * If IS_ROOT, we have to do no walking at all.
-         */
-        if (IS_ROOT(entry) && !name) {
-                *path++ = '\\';
-                if (unicode) *path++ = '\0';
-                *path++ = '\0';
-                if (unicode) *path++ = '\0';
-                return path-buf;
-        }
-        /*
-         * Build the path string walking the tree backward from end to ROOT
-         * and store it in reversed order [see reverse_string()]
-         */
-        dget(entry);
-        spin_lock(&entry->d_lock);
-        while (!IS_ROOT(entry)) {
-                struct dentry *parent;
-                if (maxlen < (3<<unicode)) {
-                        spin_unlock(&entry->d_lock);
-                        dput(entry);
-                        return -ENAMETOOLONG;
-                }
-                len = server->ops->convert(path, maxlen-2, 
-                                      entry->d_name.name, entry->d_name.len,
-                                      server->local_nls, server->remote_nls);
-                if (len < 0) {
-                        spin_unlock(&entry->d_lock);
-                        dput(entry);
-                        return len;
-                }
-                reverse_string(path, len);
-                path += len;
-                if (unicode) {
-                        /* Note: reverse order */
-                        *path++ = '\0';
-                        maxlen--;
-                }
-                *path++ = '\\';
-                maxlen -= len+1;
-                parent = entry->d_parent;
-                dget(parent);
-                spin_unlock(&entry->d_lock);
-                dput(entry);
-                entry = parent;
-                spin_lock(&entry->d_lock);
-        }
-        spin_unlock(&entry->d_lock);
-        dput(entry);
-        reverse_string(buf, path-buf);
-        /* maxlen has space for at least one char */
-test_name_and_out:
-        if (name) {
-                if (maxlen < (3<<unicode))
-                        return -ENAMETOOLONG;
-                *path++ = '\\';
-                if (unicode) {
-                        *path++ = '\0';
-                        maxlen--;
-                }
-                len = server->ops->convert(path, maxlen-2, 
-                                      name->name, name->len,
-                                      server->local_nls, server->remote_nls);
-                if (len < 0)
-                        return len;
-                path += len;
-                maxlen -= len+1;
-        }
-        /* maxlen has space for at least one char */
-        *path++ = '\0';
-        if (unicode) *path++ = '\0';
-        return path-buf;
-}
-static int smb_encode_path(struct smb_sb_info *server, char *buf, int maxlen,
-                           struct dentry *dir, struct qstr *name)
-{
-        int result;
-        result = smb_build_path(server, buf, maxlen, dir, name);
-        if (result < 0)
-                goto out;
-        if (server->opt.protocol <= SMB_PROTOCOL_COREPLUS)
-                str_upper(buf, result);
-out:
-        return result;
-}
-/* encode_path for non-trans2 request SMBs */
-static int smb_simple_encode_path(struct smb_request *req, char **p,
-                                  struct dentry * entry, struct qstr * name)
-{
-        struct smb_sb_info *server = req->rq_server;
-        char *s = *p;
-        int res;
-        int maxlen = ((char *)req->rq_buffer + req->rq_bufsize) - s;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-        if (!maxlen)
-                return -ENAMETOOLONG;
-        *s++ = 4;       /* ASCII data format */
-        /*
-         * SMB Unicode strings must be 16bit aligned relative the start of the
-         * packet. If they are not they must be padded with 0.
-         */
-        if (unicode) {
-                int align = s - (char *)req->rq_buffer;
-                if (!(align & 1)) {
-                        *s++ = '\0';
-                        maxlen--;
-                }
-        }
-        res = smb_encode_path(server, s, maxlen-1, entry, name);
-        if (res < 0)
-                return res;
-        *p = s + res;
-        return 0;
-}
-/* The following are taken directly from msdos-fs */
-/* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
-                  /* JanFebMarApr May Jun Jul Aug Sep Oct Nov Dec */
-static time_t
-utc2local(struct smb_sb_info *server, time_t time)
-{
-        return time - server->opt.serverzone*60;
-}
-static time_t
-local2utc(struct smb_sb_info *server, time_t time)
-{
-        return time + server->opt.serverzone*60;
-}
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-static time_t
-date_dos2unix(struct smb_sb_info *server, __u16 date, __u16 time)
-{
-        int month, year;
-        time_t secs;
-        /* first subtract and mask after that... Otherwise, if
-           date == 0, bad things happen */
-        month = ((date >> 5) - 1) & 15;
-        year = date >> 9;
-        secs = (time & 31) * 2 + 60 * ((time >> 5) & 63) + (time >> 11) * 3600 + 86400 *
-            ((date & 31) - 1 + day_n[month] + (year / 4) + year * 365 - ((year & 3) == 0 &&
-                                                   month < 2 ? 1 : 0) + 3653);
-        /* days since 1.1.70 plus 80's leap day */
-        return local2utc(server, secs);
-}
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-static void
-date_unix2dos(struct smb_sb_info *server,
-              int unix_date, __u16 *date, __u16 *time)
-{
-        int day, year, nl_day, month;
-        unix_date = utc2local(server, unix_date);
-        if (unix_date < 315532800)
-                unix_date = 315532800;
-        *time = (unix_date % 60) / 2 +
-                (((unix_date / 60) % 60) << 5) +
-                (((unix_date / 3600) % 24) << 11);
-        day = unix_date / 86400 - 3652;
-        year = day / 365;
-        if ((year + 3) / 4 + 365 * year > day)
-                year--;
-        day -= (year + 3) / 4 + 365 * year;
-        if (day == 59 && !(year & 3)) {
-                nl_day = day;
-                month = 2;
-        } else {
-                nl_day = (year & 3) || day <= 59 ? day : day - 1;
-                for (month = 1; month < 12; month++)
-                        if (day_n[month] > nl_day)
-                                break;
-        }
-        *date = nl_day - day_n[month - 1] + 1 + (month << 5) + (year << 9);
-}
-/* The following are taken from fs/ntfs/util.c */
-#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
-/*
- * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
- * into Unix UTC (based 1970-01-01, in seconds).
- */
-static struct timespec
-smb_ntutc2unixutc(u64 ntutc)
-{
-        struct timespec ts;
-        /* FIXME: what about the timezone difference? */
-        /* Subtract the NTFS time offset, then convert to 1s intervals. */
-        u64 t = ntutc - NTFS_TIME_OFFSET;
-        ts.tv_nsec = do_div(t, 10000000) * 100;
-        ts.tv_sec = t; 
-        return ts;
-}
-/* Convert the Unix UTC into NT time */
-static u64
-smb_unixutc2ntutc(struct timespec ts)
-{
-        /* Note: timezone conversion is probably wrong. */
-        /* return ((u64)utc2local(server, t)) * 10000000 + NTFS_TIME_OFFSET; */
-        return ((u64)ts.tv_sec) * 10000000 + ts.tv_nsec/100 + NTFS_TIME_OFFSET;
-}
-#define MAX_FILE_MODE   6
-static mode_t file_mode[] = {
-        S_IFREG, S_IFDIR, S_IFLNK, S_IFCHR, S_IFBLK, S_IFIFO, S_IFSOCK
-};
-static int smb_filetype_to_mode(u32 filetype)
-{
-        if (filetype > MAX_FILE_MODE) {
-                PARANOIA("Filetype out of range: %d\n", filetype);
-                return S_IFREG;
-        }
-        return file_mode[filetype];
-}
-static u32 smb_filetype_from_mode(int mode)
-{
-        if (S_ISREG(mode))
-                return UNIX_TYPE_FILE;
-        if (S_ISDIR(mode))
-                return UNIX_TYPE_DIR;
-        if (S_ISLNK(mode))
-                return UNIX_TYPE_SYMLINK;
-        if (S_ISCHR(mode))
-                return UNIX_TYPE_CHARDEV;
-        if (S_ISBLK(mode))
-                return UNIX_TYPE_BLKDEV;
-        if (S_ISFIFO(mode))
-                return UNIX_TYPE_FIFO;
-        if (S_ISSOCK(mode))
-                return UNIX_TYPE_SOCKET;
-        return UNIX_TYPE_UNKNOWN;
-}
-/*****************************************************************************/
-/*                                                                           */
-/*  Support section.                                                         */
-/*                                                                           */
-/*****************************************************************************/
-__u32
-smb_len(__u8 * p)
-{
-        return ((*(p+1) & 0x1) << 16L) | (*(p+2) << 8L) | *(p+3);
-}
-static __u16
-smb_bcc(__u8 * packet)
-{
-        int pos = SMB_HEADER_LEN + SMB_WCT(packet) * sizeof(__u16);
-        return WVAL(packet, pos);
-}
-/* smb_valid_packet: We check if packet fulfills the basic
-   requirements of a smb packet */
-static int
-smb_valid_packet(__u8 * packet)
-{
-        return (packet[4] == 0xff
-                && packet[5] == 'S'
-                && packet[6] == 'M'
-                && packet[7] == 'B'
-                && (smb_len(packet) + 4 == SMB_HEADER_LEN
-                    + SMB_WCT(packet) * 2 + smb_bcc(packet)));
-}
-/* smb_verify: We check if we got the answer we expected, and if we
-   got enough data. If bcc == -1, we don't care. */
-static int
-smb_verify(__u8 * packet, int command, int wct, int bcc)
-{
-        if (SMB_CMD(packet) != command)
-                goto bad_command;
-        if (SMB_WCT(packet) < wct)
-                goto bad_wct;
-        if (bcc != -1 && smb_bcc(packet) < bcc)
-                goto bad_bcc;
-        return 0;
-bad_command:
-        printk(KERN_ERR "smb_verify: command=%x, SMB_CMD=%x??\n",
-               command, SMB_CMD(packet));
-        goto fail;
-bad_wct:
-        printk(KERN_ERR "smb_verify: command=%x, wct=%d, SMB_WCT=%d??\n",
-               command, wct, SMB_WCT(packet));
-        goto fail;
-bad_bcc:
-        printk(KERN_ERR "smb_verify: command=%x, bcc=%d, SMB_BCC=%d??\n",
-               command, bcc, smb_bcc(packet));
-fail:
-        return -EIO;
-}
-/*
- * Returns the maximum read or write size for the "payload". Making all of the
- * packet fit within the negotiated max_xmit size.
- *
- * N.B. Since this value is usually computed before locking the server,
- * the server's packet size must never be decreased!
- */
-static inline int
-smb_get_xmitsize(struct smb_sb_info *server, int overhead)
-{
-        return server->opt.max_xmit - overhead;
-}
-/*
- * Calculate the maximum read size
- */
-int
-smb_get_rsize(struct smb_sb_info *server)
-{
-        /* readX has 12 parameters, read has 5 */
-        int overhead = SMB_HEADER_LEN + 12 * sizeof(__u16) + 2 + 1 + 2;
-        int size = smb_get_xmitsize(server, overhead);
-        VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-        return size;
-}
-/*
- * Calculate the maximum write size
- */
-int
-smb_get_wsize(struct smb_sb_info *server)
-{
-        /* writeX has 14 parameters, write has 5 */
-        int overhead = SMB_HEADER_LEN + 14 * sizeof(__u16) + 2 + 1 + 2;
-        int size = smb_get_xmitsize(server, overhead);
-        VERBOSE("xmit=%d, size=%d\n", server->opt.max_xmit, size);
-        return size;
-}
-/*
- * Convert SMB error codes to -E... errno values.
- */
-int
-smb_errno(struct smb_request *req)
-{
-        int errcls = req->rq_rcls;
-        int error  = req->rq_err;
-        char *class = "Unknown";
-        VERBOSE("errcls %d  code %d  from command 0x%x\n",
-                errcls, error, SMB_CMD(req->rq_header));
-        if (errcls == ERRDOS) {
-                switch (error) {
-                case ERRbadfunc:
-                        return -EINVAL;
-                case ERRbadfile:
-                case ERRbadpath:
-                        return -ENOENT;
-                case ERRnofids:
-                        return -EMFILE;
-                case ERRnoaccess:
-                        return -EACCES;
-                case ERRbadfid:
-                        return -EBADF;
-                case ERRbadmcb:
-                        return -EREMOTEIO;
-                case ERRnomem:
-                        return -ENOMEM;
-                case ERRbadmem:
-                        return -EFAULT;
-                case ERRbadenv:
-                case ERRbadformat:
-                        return -EREMOTEIO;
-                case ERRbadaccess:
-                        return -EACCES;
-                case ERRbaddata:
-                        return -E2BIG;
-                case ERRbaddrive:
-                        return -ENXIO;
-                case ERRremcd:
-                        return -EREMOTEIO;
-                case ERRdiffdevice:
-                        return -EXDEV;
-                case ERRnofiles:
-                        return -ENOENT;
-                case ERRbadshare:
-                        return -ETXTBSY;
-                case ERRlock:
-                        return -EDEADLK;
-                case ERRfilexists:
-                        return -EEXIST;
-                case ERROR_INVALID_PARAMETER:
-                        return -EINVAL;
-                case ERROR_DISK_FULL:
-                        return -ENOSPC;
-                case ERROR_INVALID_NAME:
-                        return -ENOENT;
-                case ERROR_DIR_NOT_EMPTY:
-                        return -ENOTEMPTY;
-                case ERROR_NOT_LOCKED:
-                       return -ENOLCK;
-                case ERROR_ALREADY_EXISTS:
-                        return -EEXIST;
-                default:
-                        class = "ERRDOS";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRSRV) {
-                switch (error) {
-                /* N.B. This is wrong ... EIO ? */
-                case ERRerror:
-                        return -ENFILE;
-                case ERRbadpw:
-                        return -EINVAL;
-                case ERRbadtype:
-                case ERRtimeout:
-                        return -EIO;
-                case ERRaccess:
-                        return -EACCES;
-                /*
-                 * This is a fatal error, as it means the "tree ID"
-                 * for this connection is no longer valid. We map
-                 * to a special error code and get a new connection.
-                 */
-                case ERRinvnid:
-                        return -EBADSLT;
-                default:
-                        class = "ERRSRV";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRHRD) {
-                switch (error) {
-                case ERRnowrite:
-                        return -EROFS;
-                case ERRbadunit:
-                        return -ENODEV;
-                case ERRnotready:
-                        return -EUCLEAN;
-                case ERRbadcmd:
-                case ERRdata:
-                        return -EIO;
-                case ERRbadreq:
-                        return -ERANGE;
-                case ERRbadshare:
-                        return -ETXTBSY;
-                case ERRlock:
-                        return -EDEADLK;
-                case ERRdiskfull:
-                        return -ENOSPC;
-                default:
-                        class = "ERRHRD";
-                        goto err_unknown;
-                }
-        } else if (errcls == ERRCMD) {
-                class = "ERRCMD";
-        } else if (errcls == SUCCESS) {
-                return 0;       /* This is the only valid 0 return */
-        }
-err_unknown:
-        printk(KERN_ERR "smb_errno: class %s, code %d from command 0x%x\n",
-               class, error, SMB_CMD(req->rq_header));
-        return -EIO;
-}
-/* smb_request_ok: We expect the server to be locked. Then we do the
-   request and check the answer completely. When smb_request_ok
-   returns 0, you can be quite sure that everything went well. When
-   the answer is <=0, the returned number is a valid unix errno. */
-static int
-smb_request_ok(struct smb_request *req, int command, int wct, int bcc)
-{
-        int result;
-        req->rq_resp_wct = wct;
-        req->rq_resp_bcc = bcc;
-        result = smb_add_request(req);
-        if (result != 0) {
-                DEBUG1("smb_request failed\n");
-                goto out;
-        }
-        if (smb_valid_packet(req->rq_header) != 0) {
-                PARANOIA("invalid packet!\n");
-                goto out;
-        }
-        result = smb_verify(req->rq_header, command, wct, bcc);
-out:
-        return result;
-}
-/*
- * This implements the NEWCONN ioctl. It installs the server pid,
- * sets server->state to CONN_VALID, and wakes up the waiting process.
- */
-int
-smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
-{
-        struct file *filp;
-        struct sock *sk;
-        int error;
-        VERBOSE("fd=%d, pid=%d\n", opt->fd, current->pid);
-        smb_lock_server(server);
-        /*
-         * Make sure we don't already have a valid connection ...
-         */
-        error = -EINVAL;
-        if (server->state == CONN_VALID)
-                goto out;
-        error = -EACCES;
-        if (current_uid() != server->mnt->mounted_uid &&
-            !capable(CAP_SYS_ADMIN))
-                goto out;
-        error = -EBADF;
-        filp = fget(opt->fd);
-        if (!filp)
-                goto out;
-        if (!smb_valid_socket(filp->f_path.dentry->d_inode))
-                goto out_putf;
-        server->sock_file = filp;
-        server->conn_pid = get_pid(task_pid(current));
-        server->opt = *opt;
-        server->generation += 1;
-        server->state = CONN_VALID;
-        error = 0;
-        if (server->conn_error) {
-                /*
-                 * conn_error is the returncode we originally decided to
-                 * drop the old connection on. This message should be positive
-                 * and not make people ask questions on why smbfs is printing
-                 * error messages ...
-                 */
-                printk(KERN_INFO "SMB connection re-established (%d)\n",
-                       server->conn_error);
-                server->conn_error = 0;
-        }
-        /*
-         * Store the server in sock user_data (Only used by sunrpc)
-         */
-        sk = SOCKET_I(filp->f_path.dentry->d_inode)->sk;
-        sk->sk_user_data = server;
-        /* chain into the data_ready callback */
-        server->data_ready = xchg(&sk->sk_data_ready, smb_data_ready);
-        /* check if we have an old smbmount that uses seconds for the 
-           serverzone */
-        if (server->opt.serverzone > 12*60 || server->opt.serverzone < -12*60)
-                server->opt.serverzone /= 60;
-        /* now that we have an established connection we can detect the server
-           type and enable bug workarounds */
-        if (server->opt.protocol < SMB_PROTOCOL_LANMAN2)
-                install_ops(server->ops, &smb_ops_core);
-        else if (server->opt.protocol == SMB_PROTOCOL_LANMAN2)
-                install_ops(server->ops, &smb_ops_os2);
-        else if (server->opt.protocol == SMB_PROTOCOL_NT1 &&
-                 (server->opt.max_xmit < 0x1000) &&
-                 !(server->opt.capabilities & SMB_CAP_NT_SMBS)) {
-                /* FIXME: can we kill the WIN95 flag now? */
-                server->mnt->flags |= SMB_MOUNT_WIN95;
-                VERBOSE("detected WIN95 server\n");
-                install_ops(server->ops, &smb_ops_win95);
-        } else {
-                /*
-                 * Samba has max_xmit 65535
-                 * NT4spX has max_xmit 4536 (or something like that)
-                 * win2k has ...
-                 */
-                VERBOSE("detected NT1 (Samba, NT4/5) server\n");
-                install_ops(server->ops, &smb_ops_winNT);
-        }
-        /* FIXME: the win9x code wants to modify these ... (seek/trunc bug) */
-        if (server->mnt->flags & SMB_MOUNT_OLDATTR) {
-                server->ops->getattr = smb_proc_getattr_core;
-        } else if (server->mnt->flags & SMB_MOUNT_DIRATTR) {
-                server->ops->getattr = smb_proc_getattr_ff;
-        }
-        /* Decode server capabilities */
-        if (server->opt.capabilities & SMB_CAP_LARGE_FILES) {
-                /* Should be ok to set this now, as no one can access the
-                   mount until the connection has been established. */
-                SB_of(server)->s_maxbytes = ~0ULL >> 1;
-                VERBOSE("LFS enabled\n");
-        }
-        if (server->opt.capabilities & SMB_CAP_UNICODE) {
-                server->mnt->flags |= SMB_MOUNT_UNICODE;
-                VERBOSE("Unicode enabled\n");
-        } else {
-                server->mnt->flags &= ~SMB_MOUNT_UNICODE;
-        }
-#if 0
-        /* flags we may test for other patches ... */
-        if (server->opt.capabilities & SMB_CAP_LARGE_READX) {
-                VERBOSE("Large reads enabled\n");
-        }
-        if (server->opt.capabilities & SMB_CAP_LARGE_WRITEX) {
-                VERBOSE("Large writes enabled\n");
-        }
-#endif
-        if (server->opt.capabilities & SMB_CAP_UNIX) {
-                struct inode *inode;
-                VERBOSE("Using UNIX CIFS extensions\n");
-                install_ops(server->ops, &smb_ops_unix);
-                inode = SB_of(server)->s_root->d_inode;
-                if (inode)
-                        inode->i_op = &smb_dir_inode_operations_unix;
-        }
-        VERBOSE("protocol=%d, max_xmit=%d, pid=%d capabilities=0x%x\n",
-                server->opt.protocol, server->opt.max_xmit,
-                pid_nr(server->conn_pid), server->opt.capabilities);
-        /* FIXME: this really should be done by smbmount. */
-        if (server->opt.max_xmit > SMB_MAX_PACKET_SIZE) {
-                server->opt.max_xmit = SMB_MAX_PACKET_SIZE;
-        }
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        if (server->opt.capabilities & SMB_CAP_UNIX)
-                smb_proc_query_cifsunix(server);
-        server->conn_complete++;
-        wake_up_interruptible_all(&server->conn_wq);
-        return error;
-out:
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        return error;
-out_putf:
-        fput(filp);
-        goto out;
-}
-/* smb_setup_header: We completely set up the packet. You only have to
-   insert the command-specific fields */
-__u8 *
-smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc)
-{
-        __u32 xmit_len = SMB_HEADER_LEN + wct * sizeof(__u16) + bcc + 2;
-        __u8 *p = req->rq_header;
-        struct smb_sb_info *server = req->rq_server;
-        p = smb_encode_smb_length(p, xmit_len - 4);
-        *p++ = 0xff;
-        *p++ = 'S';
-        *p++ = 'M';
-        *p++ = 'B';
-        *p++ = command;
-        memset(p, '\0', 19);
-        p += 19;
-        p += 8;
-        if (server->opt.protocol > SMB_PROTOCOL_CORE) {
-                int flags = SMB_FLAGS_CASELESS_PATHNAMES;
-                int flags2 = SMB_FLAGS2_LONG_PATH_COMPONENTS |
-                        SMB_FLAGS2_EXTENDED_ATTRIBUTES; /* EA? not really ... */
-                *(req->rq_header + smb_flg) = flags;
-                if (server->mnt->flags & SMB_MOUNT_UNICODE)
-                        flags2 |= SMB_FLAGS2_UNICODE_STRINGS;
-                WSET(req->rq_header, smb_flg2, flags2);
-        }
-        *p++ = wct;             /* wct */
-        p += 2 * wct;
-        WSET(p, 0, bcc);
-        /* Include the header in the data to send */
-        req->rq_iovlen = 1;
-        req->rq_iov[0].iov_base = req->rq_header;
-        req->rq_iov[0].iov_len  = xmit_len - bcc;
-        return req->rq_buffer;
-}
-static void
-smb_setup_bcc(struct smb_request *req, __u8 *p)
-{
-        u16 bcc = p - req->rq_buffer;
-        u8 *pbcc = req->rq_header + SMB_HEADER_LEN + 2*SMB_WCT(req->rq_header);
-        WSET(pbcc, 0, bcc);
-        smb_encode_smb_length(req->rq_header, SMB_HEADER_LEN + 
-                              2*SMB_WCT(req->rq_header) - 2 + bcc);
-        /* Include the "bytes" in the data to send */
-        req->rq_iovlen = 2;
-        req->rq_iov[1].iov_base = req->rq_buffer;
-        req->rq_iov[1].iov_len  = bcc;
-}
-static int
-smb_proc_seek(struct smb_sb_info *server, __u16 fileid,
-              __u16 mode, off_t offset)
-{
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBlseek, 4, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        WSET(req->rq_header, smb_vwv1, mode);
-        DSET(req->rq_header, smb_vwv2, offset);
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBlseek, 2, 0);
-        if (result < 0) {
-                result = 0;
-                goto out_free;
-        }
-        result = DVAL(req->rq_header, smb_vwv0);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_open(struct smb_sb_info *server, struct dentry *dentry, int wish)
-{
-        struct inode *ino = dentry->d_inode;
-        struct smb_inode_info *ei = SMB_I(ino);
-        int mode, read_write = 0x42, read_only = 0x40;
-        int res;
-        char *p;
-        struct smb_request *req;
-        /*
-         * Attempt to open r/w, unless there are no write privileges.
-         */
-        mode = read_write;
-        if (!(ino->i_mode & (S_IWUSR | S_IWGRP | S_IWOTH)))
-                mode = read_only;
-#if 0
-        /* FIXME: why is this code not in? below we fix it so that a caller
-           wanting RO doesn't get RW. smb_revalidate_inode does some 
-           optimization based on access mode. tail -f needs it to be correct.
-           We must open rw since we don't do the open if called a second time
-           with different 'wish'. Is that not supported by smb servers? */
-        if (!(wish & (O_WRONLY | O_RDWR)))
-                mode = read_only;
-#endif
-        res = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-      retry:
-        p = smb_setup_header(req, SMBopen, 2, 0);
-        WSET(req->rq_header, smb_vwv0, mode);
-        WSET(req->rq_header, smb_vwv1, aSYSTEM | aHIDDEN | aDIR);
-        res = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (res < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        res = smb_request_ok(req, SMBopen, 7, 0);
-        if (res != 0) {
-                if (mode == read_write &&
-                    (res == -EACCES || res == -ETXTBSY || res == -EROFS))
-                {
-                        VERBOSE("%s/%s R/W failed, error=%d, retrying R/O\n",
-                                DENTRY_PATH(dentry), res);
-                        mode = read_only;
-                        req->rq_flags = 0;
-                        goto retry;
-                }
-                goto out_free;
-        }
-        /* We should now have data in vwv[0..6]. */
-        ei->fileid = WVAL(req->rq_header, smb_vwv0);
-        ei->attr   = WVAL(req->rq_header, smb_vwv1);
-        /* smb_vwv2 has mtime */
-        /* smb_vwv4 has size  */
-        ei->access = (WVAL(req->rq_header, smb_vwv6) & SMB_ACCMASK);
-        ei->open = server->generation;
-out_free:
-        smb_rput(req);
-out:
-        return res;
-}
-/*
- * Make sure the file is open, and check that the access
- * is compatible with the desired access.
- */
-int
-smb_open(struct dentry *dentry, int wish)
-{
-        struct inode *inode = dentry->d_inode;
-        int result;
-        __u16 access;
-        result = -ENOENT;
-        if (!inode) {
-                printk(KERN_ERR "smb_open: no inode for dentry %s/%s\n",
-                       DENTRY_PATH(dentry));
-                goto out;
-        }
-        if (!smb_is_open(inode)) {
-                struct smb_sb_info *server = server_from_inode(inode);
-                result = 0;
-                if (!smb_is_open(inode))
-                        result = smb_proc_open(server, dentry, wish);
-                if (result)
-                        goto out;
-                /*
-                 * A successful open means the path is still valid ...
-                 */
-                smb_renew_times(dentry);
-        }
-        /*
-         * Check whether the access is compatible with the desired mode.
-         */
-        result = 0;
-        access = SMB_I(inode)->access;
-        if (access != wish && access != SMB_O_RDWR) {
-                PARANOIA("%s/%s access denied, access=%x, wish=%x\n",
-                         DENTRY_PATH(dentry), access, wish);
-                result = -EACCES;
-        }
-out:
-        return result;
-}
-static int 
-smb_proc_close(struct smb_sb_info *server, __u16 fileid, __u32 mtime)
-{
-        struct smb_request *req;
-        int result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBclose, 3, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        DSET(req->rq_header, smb_vwv1, utc2local(server, mtime));
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBclose, 0, 0);
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Win NT 4.0 has an apparent bug in that it fails to update the
- * modify time when writing to a file. As a workaround, we update
- * both modify and access time locally, and post the times to the
- * server when closing the file.
- */
-static int 
-smb_proc_close_inode(struct smb_sb_info *server, struct inode * ino)
-{
-        struct smb_inode_info *ei = SMB_I(ino);
-        int result = 0;
-        if (smb_is_open(ino))
-        {
-                /*
-                 * We clear the open flag in advance, in case another
-                 * process observes the value while we block below.
-                 */
-                ei->open = 0;
-                /*
-                 * Kludge alert: SMB timestamps are accurate only to
-                 * two seconds ... round the times to avoid needless
-                 * cache invalidations!
-                 */
-                if (ino->i_mtime.tv_sec & 1) { 
-                        ino->i_mtime.tv_sec--;
-                        ino->i_mtime.tv_nsec = 0; 
-                }
-                if (ino->i_atime.tv_sec & 1) {
-                        ino->i_atime.tv_sec--;
-                        ino->i_atime.tv_nsec = 0;
-                }
-                /*
-                 * If the file is open with write permissions,
-                 * update the time stamps to sync mtime and atime.
-                 */
-                if ((server->opt.capabilities & SMB_CAP_UNIX) == 0 &&
-                    (server->opt.protocol >= SMB_PROTOCOL_LANMAN2) &&
-                    !(ei->access == SMB_O_RDONLY))
-                {
-                        struct smb_fattr fattr;
-                        smb_get_inode_attr(ino, &fattr);
-                        smb_proc_setattr_ext(server, ino, &fattr);
-                }
-                result = smb_proc_close(server, ei->fileid, ino->i_mtime.tv_sec);
-                /*
-                 * Force a revalidation after closing ... some servers
-                 * don't post the size until the file has been closed.
-                 */
-                if (server->opt.protocol < SMB_PROTOCOL_NT1)
-                        ei->oldmtime = 0;
-                ei->closed = jiffies;
-        }
-        return result;
-}
-int
-smb_close(struct inode *ino)
-{
-        int result = 0;
-        if (smb_is_open(ino)) {
-                struct smb_sb_info *server = server_from_inode(ino);
-                result = smb_proc_close_inode(server, ino);
-        }
-        return result;
-}
-/*
- * This is used to close a file following a failed instantiate.
- * Since we don't have an inode, we can't use any of the above.
- */
-int
-smb_close_fileid(struct dentry *dentry, __u16 fileid)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int result;
-        result = smb_proc_close(server, fileid, get_seconds());
-        return result;
-}
-/* In smb_proc_read and smb_proc_write we do not retry, because the
-   file-id would not be valid after a reconnection. */
-static void
-smb_proc_read_data(struct smb_request *req)
-{
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = 3;
-        req->rq_iov[1].iov_base = req->rq_page;
-        req->rq_iov[1].iov_len  = req->rq_rsize;
-        req->rq_iovlen = 2;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-static int
-smb_proc_read(struct inode *inode, loff_t offset, int count, char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        __u16 returned_count, data_len;
-        unsigned char *buf;
-        int result;
-        struct smb_request *req;
-        u8 rbuf[4];
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBread, 5, 0);
-        buf = req->rq_header;
-        WSET(buf, smb_vwv0, SMB_I(inode)->fileid);
-        WSET(buf, smb_vwv1, count);
-        DSET(buf, smb_vwv2, offset);
-        WSET(buf, smb_vwv4, 0);
-        req->rq_page = data;
-        req->rq_rsize = count;
-        req->rq_callback = smb_proc_read_data;
-        req->rq_buffer = rbuf;
-        req->rq_flags |= SMB_REQ_NORETRY | SMB_REQ_STATIC;
-        result = smb_request_ok(req, SMBread, 5, -1);
-        if (result < 0)
-                goto out_free;
-        returned_count = WVAL(req->rq_header, smb_vwv0);
-        data_len = WVAL(rbuf, 1);
-        if (returned_count != data_len) {
-                printk(KERN_NOTICE "smb_proc_read: returned != data_len\n");
-                printk(KERN_NOTICE "smb_proc_read: ret_c=%d, data_len=%d\n",
-                       returned_count, data_len);
-        }
-        result = data_len;
-out_free:
-        smb_rput(req);
-out:
-        VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, result);
-        return result;
-}
-static int
-smb_proc_write(struct inode *inode, loff_t offset, int count, const char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        u16 fileid = SMB_I(inode)->fileid;
-        u8 buf[4];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-                inode->i_ino, fileid, count, offset);
-        smb_setup_header(req, SMBwrite, 5, count + 3);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        WSET(req->rq_header, smb_vwv1, count);
-        DSET(req->rq_header, smb_vwv2, offset);
-        WSET(req->rq_header, smb_vwv4, 0);
-        buf[0] = 1;
-        WSET(buf, 1, count);    /* yes, again ... */
-        req->rq_iov[1].iov_base = buf;
-        req->rq_iov[1].iov_len = 3;
-        req->rq_iov[2].iov_base = (char *) data;
-        req->rq_iov[2].iov_len = count;
-        req->rq_iovlen = 3;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBwrite, 1, 0);
-        if (result >= 0)
-                result = WVAL(req->rq_header, smb_vwv0);
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * In smb_proc_readX and smb_proc_writeX we do not retry, because the
- * file-id would not be valid after a reconnection.
- */
-#define SMB_READX_MAX_PAD      64
-static void
-smb_proc_readX_data(struct smb_request *req)
-{
-        /* header length, excluding the netbios length (-4) */
-        int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-        int data_off = WVAL(req->rq_header, smb_vwv6);
-        /*
-         * Some genius made the padding to the data bytes arbitrary.
-         * So we must first calculate the amount of padding used by the server.
-         */
-        data_off -= hdrlen;
-        if (data_off > SMB_READX_MAX_PAD || data_off < 0) {
-                PARANOIA("offset is larger than SMB_READX_MAX_PAD or negative!\n");
-                PARANOIA("%d > %d || %d < 0\n", data_off, SMB_READX_MAX_PAD, data_off);
-                req->rq_rlen = req->rq_bufsize + 1;
-                return;
-        }
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = data_off;
-        req->rq_iov[1].iov_base = req->rq_page;
-        req->rq_iov[1].iov_len  = req->rq_rsize;
-        req->rq_iovlen = 2;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-}
-static int
-smb_proc_readX(struct inode *inode, loff_t offset, int count, char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        unsigned char *buf;
-        int result;
-        struct smb_request *req;
-        static char pad[SMB_READX_MAX_PAD];
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBreadX, 12, 0);
-        buf = req->rq_header;
-        WSET(buf, smb_vwv0, 0x00ff);
-        WSET(buf, smb_vwv1, 0);
-        WSET(buf, smb_vwv2, SMB_I(inode)->fileid);
-        DSET(buf, smb_vwv3, (u32)offset);               /* low 32 bits */
-        WSET(buf, smb_vwv5, count);
-        WSET(buf, smb_vwv6, 0);
-        DSET(buf, smb_vwv7, 0);
-        WSET(buf, smb_vwv9, 0);
-        DSET(buf, smb_vwv10, (u32)(offset >> 32));      /* high 32 bits */
-        WSET(buf, smb_vwv11, 0);
-        req->rq_page = data;
-        req->rq_rsize = count;
-        req->rq_callback = smb_proc_readX_data;
-        req->rq_buffer = pad;
-        req->rq_bufsize = SMB_READX_MAX_PAD;
-        req->rq_flags |= SMB_REQ_STATIC | SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBreadX, 12, -1);
-        if (result < 0)
-                goto out_free;
-        result = WVAL(req->rq_header, smb_vwv5);
-out_free:
-        smb_rput(req);
-out:
-        VERBOSE("ino=%ld, fileid=%d, count=%d, result=%d\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, result);
-        return result;
-}
-static int
-smb_proc_writeX(struct inode *inode, loff_t offset, int count, const char *data)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        u8 *p;
-        static u8 pad[4];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        VERBOSE("ino=%ld, fileid=%d, count=%d@%Ld\n",
-                inode->i_ino, SMB_I(inode)->fileid, count, offset);
-        p = smb_setup_header(req, SMBwriteX, 14, count + 1);
-        WSET(req->rq_header, smb_vwv0, 0x00ff);
-        WSET(req->rq_header, smb_vwv1, 0);
-        WSET(req->rq_header, smb_vwv2, SMB_I(inode)->fileid);
-        DSET(req->rq_header, smb_vwv3, (u32)offset);    /* low 32 bits */
-        DSET(req->rq_header, smb_vwv5, 0);
-        WSET(req->rq_header, smb_vwv7, 0);              /* write mode */
-        WSET(req->rq_header, smb_vwv8, 0);
-        WSET(req->rq_header, smb_vwv9, 0);
-        WSET(req->rq_header, smb_vwv10, count);         /* data length */
-        WSET(req->rq_header, smb_vwv11, smb_vwv12 + 2 + 1);
-        DSET(req->rq_header, smb_vwv12, (u32)(offset >> 32));
-        req->rq_iov[1].iov_base = pad;
-        req->rq_iov[1].iov_len = 1;
-        req->rq_iov[2].iov_base = (char *) data;
-        req->rq_iov[2].iov_len = count;
-        req->rq_iovlen = 3;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBwriteX, 6, 0);
-        if (result >= 0)
-                result = WVAL(req->rq_header, smb_vwv2);
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBcreate, 3, 0);
-        WSET(req->rq_header, smb_vwv0, attr);
-        DSET(req->rq_header, smb_vwv1, utc2local(server, ctime));
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, SMBcreate, 1, 0);
-        if (result < 0)
-                goto out_free;
-        *fileid = WVAL(req->rq_header, smb_vwv0);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(old_dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBmv, 1, 0);
-        WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN | aDIR);
-        result = smb_simple_encode_path(req, &p, old_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        result = smb_simple_encode_path(req, &p, new_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBmv, 0, 0)) < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Code common to mkdir and rmdir.
- */
-static int
-smb_proc_generic_command(struct dentry *dentry, __u8 command)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, command, 0, 0);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, command, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_mkdir(struct dentry *dentry)
-{
-        return smb_proc_generic_command(dentry, SMBmkdir);
-}
-int
-smb_proc_rmdir(struct dentry *dentry)
-{
-        return smb_proc_generic_command(dentry, SMBrmdir);
-}
-#if SMBFS_POSIX_UNLINK
-/*
- * Removes readonly attribute from a file. Used by unlink to give posix
- * semantics.
- */
-static int
-smb_set_rw(struct dentry *dentry,struct smb_sb_info *server)
-{
-        int result;
-        struct smb_fattr fattr;
-        /* FIXME: cifsUE should allow removing a readonly file. */
-        /* first get current attribute */
-        smb_init_dirent(server, &fattr);
-        result = server->ops->getattr(server, dentry, &fattr);
-        smb_finish_dirent(server, &fattr);
-        if (result < 0)
-                return result;
-        /* if RONLY attribute is set, remove it */
-        if (fattr.attr & aRONLY) {  /* read only attribute is set */
-                fattr.attr &= ~aRONLY;
-                result = smb_proc_setattr_core(server, dentry, fattr.attr);
-        }
-        return result;
-}
-#endif
-int
-smb_proc_unlink(struct dentry *dentry)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        int flag = 0;
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-      retry:
-        p = smb_setup_header(req, SMBunlink, 1, 0);
-        WSET(req->rq_header, smb_vwv0, aSYSTEM | aHIDDEN);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBunlink, 0, 0)) < 0) {
-#if SMBFS_POSIX_UNLINK
-                if (result == -EACCES && !flag) {
-                        /* Posix semantics is for the read-only state
-                           of a file to be ignored in unlink(). In the
-                           SMB world a unlink() is refused on a
-                           read-only file. To make things easier for
-                           unix users we try to override the files
-                           permission if the unlink fails with the
-                           right error.
-                           This introduces a race condition that could
-                           lead to a file being written by someone who
-                           shouldn't have access, but as far as I can
-                           tell that is unavoidable */
-                        /* remove RONLY attribute and try again */
-                        result = smb_set_rw(dentry,server);
-                        if (result == 0) {
-                                flag = 1;
-                                req->rq_flags = 0;
-                                goto retry;
-                        }
-                }
-#endif
-                goto out_free;
-        }
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_flush(struct smb_sb_info *server, __u16 fileid)
-{
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBflush, 1, 0);
-        WSET(req->rq_header, smb_vwv0, fileid);
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBflush, 0, 0);
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_trunc32(struct inode *inode, loff_t length)
-{
-        /*
-         * Writing 0bytes is old-SMB magic for truncating files.
-         * MAX_NON_LFS should prevent this from being called with a too
-         * large offset.
-         */
-        return smb_proc_write(inode, length, 0, NULL);
-}
-static int
-smb_proc_trunc64(struct inode *inode, loff_t length)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result;
-        char *param;
-        char *data;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 14)))
-                goto out;
-        param = req->rq_buffer;
-        data = req->rq_buffer + 6;
-        /* FIXME: must we also set allocation size? winNT seems to do that */
-        WSET(param, 0, SMB_I(inode)->fileid);
-        WSET(param, 2, SMB_SET_FILE_END_OF_FILE_INFO);
-        WSET(param, 4, 0);
-        LSET(data, 0, length);
-        req->rq_trans2_command = TRANSACT2_SETFILEINFO;
-        req->rq_ldata = 8;
-        req->rq_data  = data;
-        req->rq_lparm = 6;
-        req->rq_parm  = param;
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-        if (req->rq_rcls != 0)
-                result = smb_errno(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_trunc95(struct inode *inode, loff_t length)
-{
-        struct smb_sb_info *server = server_from_inode(inode);
-        int result = smb_proc_trunc32(inode, length);
- 
-        /*
-         * win9x doesn't appear to update the size immediately.
-         * It will return the old file size after the truncate,
-         * confusing smbfs. So we force an update.
-         *
-         * FIXME: is this still necessary?
-         */
-        smb_proc_flush(server, SMB_I(inode)->fileid);
-        return result;
-}
-static void
-smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-        memset(fattr, 0, sizeof(*fattr));
-        fattr->f_nlink = 1;
-        fattr->f_uid = server->mnt->uid;
-        fattr->f_gid = server->mnt->gid;
-        fattr->f_unix = 0;
-}
-static void
-smb_finish_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
-{
-        if (fattr->f_unix)
-                return;
-        fattr->f_mode = server->mnt->file_mode;
-        if (fattr->attr & aDIR) {
-                fattr->f_mode = server->mnt->dir_mode;
-                fattr->f_size = SMB_ST_BLKSIZE;
-        }
-        /* Check the read-only flag */
-        if (fattr->attr & aRONLY)
-                fattr->f_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH);
-        /* How many 512 byte blocks do we need for this file? */
-        fattr->f_blocks = 0;
-        if (fattr->f_size != 0)
-                fattr->f_blocks = 1 + ((fattr->f_size-1) >> 9);
-        return;
-}
-void
-smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-                     struct super_block *sb)
-{
-        smb_init_dirent(server, fattr);
-        fattr->attr = aDIR;
-        fattr->f_ino = 2; /* traditional root inode number */
-        fattr->f_mtime = current_fs_time(sb);
-        smb_finish_dirent(server, fattr);
-}
-/*
- * Decode a dirent for old protocols
- *
- * qname is filled with the decoded, and possibly translated, name.
- * fattr receives decoded attributes
- *
- * Bugs Noted:
- * (1) Pathworks servers may pad the name with extra spaces.
- */
-static char *
-smb_decode_short_dirent(struct smb_sb_info *server, char *p,
-                        struct qstr *qname, struct smb_fattr *fattr,
-                        unsigned char *name_buf)
-{
-        int len;
-        /*
-         * SMB doesn't have a concept of inode numbers ...
-         */
-        smb_init_dirent(server, fattr);
-        fattr->f_ino = 0;       /* FIXME: do we need this? */
-        p += SMB_STATUS_SIZE;   /* reserved (search_status) */
-        fattr->attr = *p;
-        fattr->f_mtime.tv_sec = date_dos2unix(server, WVAL(p, 3), WVAL(p, 1));
-        fattr->f_mtime.tv_nsec = 0;
-        fattr->f_size = DVAL(p, 5);
-        fattr->f_ctime = fattr->f_mtime;
-        fattr->f_atime = fattr->f_mtime;
-        qname->name = p + 9;
-        len = strnlen(qname->name, 12);
-        /*
-         * Trim trailing blanks for Pathworks servers
-         */
-        while (len > 2 && qname->name[len-1] == ' ')
-                len--;
-        smb_finish_dirent(server, fattr);
-#if 0
-        /* FIXME: These only work for ascii chars, and recent smbmount doesn't
-           allow the flag to be set anyway. It kills const. Remove? */
-        switch (server->opt.case_handling) {
-        case SMB_CASE_UPPER:
-                str_upper(entry->name, len);
-                break;
-        case SMB_CASE_LOWER:
-                str_lower(entry->name, len);
-                break;
-        default:
-                break;
-        }
-#endif
-        qname->len = 0;
-        len = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-                                   qname->name, len,
-                                   server->remote_nls, server->local_nls);
-        if (len > 0) {
-                qname->len = len;
-                qname->name = name_buf;
-                DEBUG1("len=%d, name=%.*s\n",qname->len,qname->len,qname->name);
-        }
-        return p + 22;
-}
-/*
- * This routine is used to read in directory entries from the network.
- * Note that it is for short directory name seeks, i.e.: protocol <
- * SMB_PROTOCOL_LANMAN2
- */
-static int
-smb_proc_readdir_short(struct file *filp, void *dirent, filldir_t filldir,
-                       struct smb_cache_control *ctl)
-{
-        struct dentry *dir = filp->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dir);
-        struct qstr qname;
-        struct smb_fattr fattr;
-        char *p;
-        int result;
-        int i, first, entries_seen, entries;
-        int entries_asked = (server->opt.max_xmit - 100) / SMB_DIRINFO_SIZE;
-        __u16 bcc;
-        __u16 count;
-        char status[SMB_STATUS_SIZE];
-        static struct qstr mask = {
-                .name   = "*.*",
-                .len    = 3,
-        };
-        unsigned char *last_status;
-        struct smb_request *req;
-        unsigned char *name_buf;
-        VERBOSE("%s/%s\n", DENTRY_PATH(dir));
-        lock_kernel();
-        result = -ENOMEM;
-        if (! (name_buf = kmalloc(SMB_MAXNAMELEN, GFP_KERNEL)))
-                goto out;
-        first = 1;
-        entries = 0;
-        entries_seen = 2; /* implicit . and .. */
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-                goto out_name;
-        while (1) {
-                p = smb_setup_header(req, SMBsearch, 2, 0);
-                WSET(req->rq_header, smb_vwv0, entries_asked);
-                WSET(req->rq_header, smb_vwv1, aDIR);
-                if (first == 1) {
-                        result = smb_simple_encode_path(req, &p, dir, &mask);
-                        if (result < 0)
-                                goto out_free;
-                        if (p + 3 > (char *)req->rq_buffer + req->rq_bufsize) {
-                                result = -ENAMETOOLONG;
-                                goto out_free;
-                        }
-                        *p++ = 5;
-                        WSET(p, 0, 0);
-                        p += 2;
-                        first = 0;
-                } else {
-                        if (p + 5 + SMB_STATUS_SIZE >
-                            (char *)req->rq_buffer + req->rq_bufsize) {
-                                result = -ENAMETOOLONG;
-                                goto out_free;
-                        }
-                                
-                        *p++ = 4;
-                        *p++ = 0;
-                        *p++ = 5;
-                        WSET(p, 0, SMB_STATUS_SIZE);
-                        p += 2;
-                        memcpy(p, status, SMB_STATUS_SIZE);
-                        p += SMB_STATUS_SIZE;
-                }
-                smb_setup_bcc(req, p);
-                result = smb_request_ok(req, SMBsearch, 1, -1);
-                if (result < 0) {
-                        if ((req->rq_rcls == ERRDOS) && 
-                            (req->rq_err  == ERRnofiles))
-                                break;
-                        goto out_free;
-                }
-                count = WVAL(req->rq_header, smb_vwv0);
-                if (count <= 0)
-                        break;
-                result = -EIO;
-                bcc = smb_bcc(req->rq_header);
-                if (bcc != count * SMB_DIRINFO_SIZE + 3)
-                        goto out_free;
-                p = req->rq_buffer + 3;
-                /* Make sure the response fits in the buffer. Fixed sized 
-                   entries means we don't have to check in the decode loop. */
-                last_status = req->rq_buffer + 3 + (count-1) * SMB_DIRINFO_SIZE;
-                if (last_status + SMB_DIRINFO_SIZE >=
-                    req->rq_buffer + req->rq_bufsize) {
-                        printk(KERN_ERR "smb_proc_readdir_short: "
-                               "last dir entry outside buffer! "
-                               "%d@%p  %d@%p\n", SMB_DIRINFO_SIZE, last_status,
-                               req->rq_bufsize, req->rq_buffer);
-                        goto out_free;
-                }
-                /* Read the last entry into the status field. */
-                memcpy(status, last_status, SMB_STATUS_SIZE);
-                /* Now we are ready to parse smb directory entries. */
-                for (i = 0; i < count; i++) {
-                        p = smb_decode_short_dirent(server, p, 
-                                                    &qname, &fattr, name_buf);
-                        if (qname.len == 0)
-                                continue;
-                        if (entries_seen == 2 && qname.name[0] == '.') {
-                                if (qname.len == 1)
-                                        continue;
-                                if (qname.name[1] == '.' && qname.len == 2)
-                                        continue;
-                        }
-                        if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-                                            &qname, &fattr))
-                                ;       /* stop reading? */
-                        entries_seen++;
-                }
-        }
-        result = entries;
-out_free:
-        smb_rput(req);
-out_name:
-        kfree(name_buf);
-out:
-        unlock_kernel();
-        return result;
-}
-static void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
-{
-        u64 size, disk_bytes;
-        /* FIXME: verify nls support. all is sent as utf8? */
-        fattr->f_unix = 1;
-        fattr->f_mode = 0;
-        /* FIXME: use the uniqueID from the remote instead? */
-        /* 0 L file size in bytes */
-        /* 8 L file size on disk in bytes (block count) */
-        /* 40 L uid */
-        /* 48 L gid */
-        /* 56 W file type */
-        /* 60 L devmajor */
-        /* 68 L devminor */
-        /* 76 L unique ID (inode) */
-        /* 84 L permissions */
-        /* 92 L link count */
-        size = LVAL(p, 0);
-        disk_bytes = LVAL(p, 8);
-        /*
-         * Some samba versions round up on-disk byte usage
-         * to 1MB boundaries, making it useless. When seeing
-         * that, use the size instead.
-         */
-        if (!(disk_bytes & 0xfffff))
-                disk_bytes = size+511;
-        fattr->f_size = size;
-        fattr->f_blocks = disk_bytes >> 9;
-        fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
-        fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
-        fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
-        if (server->mnt->flags & SMB_MOUNT_UID)
-                fattr->f_uid = server->mnt->uid;
-        else
-                fattr->f_uid = LVAL(p, 40);
-        if (server->mnt->flags & SMB_MOUNT_GID)
-                fattr->f_gid = server->mnt->gid;
-        else
-                fattr->f_gid = LVAL(p, 48);
-        fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
-        if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
-                __u64 major = LVAL(p, 60);
-                __u64 minor = LVAL(p, 68);
-                fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
-                if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
-                MINOR(fattr->f_rdev) != (minor & 0xffffffff))
-                        fattr->f_rdev = 0;
-        }
-        fattr->f_mode |= LVAL(p, 84);
-        if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
-             (S_ISDIR(fattr->f_mode)) )
-                fattr->f_mode = (server->mnt->dir_mode & S_IRWXUGO) | S_IFDIR;
-        else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
-                  !(S_ISDIR(fattr->f_mode)) )
-                fattr->f_mode = (server->mnt->file_mode & S_IRWXUGO) |
-                                (fattr->f_mode & S_IFMT);
-}
-/*
- * Interpret a long filename structure using the specified info level:
- *   level 1 for anything below NT1 protocol
- *   level 260 for NT1 protocol
- *
- * qname is filled with the decoded, and possibly translated, name
- * fattr receives decoded attributes.
- *
- * Bugs Noted:
- * (1) Win NT 4.0 appends a null byte to names and counts it in the length!
- */
-static char *
-smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
-                       struct qstr *qname, struct smb_fattr *fattr,
-                       unsigned char *name_buf)
-{
-        char *result;
-        unsigned int len = 0;
-        int n;
-        __u16 date, time;
-        int unicode = (server->mnt->flags & SMB_MOUNT_UNICODE);
-        /*
-         * SMB doesn't have a concept of inode numbers ...
-         */
-        smb_init_dirent(server, fattr);
-        fattr->f_ino = 0;       /* FIXME: do we need this? */
-        switch (level) {
-        case 1:
-                len = *((unsigned char *) p + 22);
-                qname->name = p + 23;
-                result = p + 24 + len;
-                date = WVAL(p, 0);
-                time = WVAL(p, 2);
-                fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_ctime.tv_nsec = 0;
-                date = WVAL(p, 4);
-                time = WVAL(p, 6);
-                fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_atime.tv_nsec = 0;
-                date = WVAL(p, 8);
-                time = WVAL(p, 10);
-                fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-                fattr->f_mtime.tv_nsec = 0;
-                fattr->f_size = DVAL(p, 12);
-                /* ULONG allocation size */
-                fattr->attr = WVAL(p, 20);
-                VERBOSE("info 1 at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        case 260:
-                result = p + WVAL(p, 0);
-                len = DVAL(p, 60);
-                if (len > 255) len = 255;
-                /* NT4 null terminates, unless we are using unicode ... */
-                qname->name = p + 94;
-                if (!unicode && len && qname->name[len-1] == '\0')
-                        len--;
-                fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 8));
-                fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 16));
-                fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 24));
-                /* change time (32) */
-                fattr->f_size = LVAL(p, 40);
-                /* alloc size (48) */
-                fattr->attr = DVAL(p, 56);
-                VERBOSE("info 260 at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        case SMB_FIND_FILE_UNIX:
-                result = p + WVAL(p, 0);
-                qname->name = p + 108;
-                len = strlen(qname->name);
-                /* FIXME: should we check the length?? */
-                p += 8;
-                smb_decode_unix_basic(fattr, server, p);
-                VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
-                        p, len, len, qname->name);
-                break;
-        default:
-                PARANOIA("Unknown info level %d\n", level);
-                result = p + WVAL(p, 0);
-                goto out;
-        }
-        smb_finish_dirent(server, fattr);
-#if 0
-        /* FIXME: These only work for ascii chars, and recent smbmount doesn't
-           allow the flag to be set anyway. Remove? */
-        switch (server->opt.case_handling) {
-        case SMB_CASE_UPPER:
-                str_upper(qname->name, len);
-                break;
-        case SMB_CASE_LOWER:
-                str_lower(qname->name, len);
-                break;
-        default:
-                break;
-        }
-#endif
-        qname->len = 0;
-        n = server->ops->convert(name_buf, SMB_MAXNAMELEN,
-                                 qname->name, len,
-                                 server->remote_nls, server->local_nls);
-        if (n > 0) {
-                qname->len = n;
-                qname->name = name_buf;
-        }
-out:
-        return result;
-}
-/* findfirst/findnext flags */
-#define SMB_CLOSE_AFTER_FIRST (1<<0)
-#define SMB_CLOSE_IF_END (1<<1)
-#define SMB_REQUIRE_RESUME_KEY (1<<2)
-#define SMB_CONTINUE_BIT (1<<3)
-/*
- * Note: samba-2.0.7 (at least) has a very similar routine, cli_list, in
- * source/libsmb/clilist.c. When looking for smb bugs in the readdir code,
- * go there for advise.
- *
- * Bugs Noted:
- * (1) When using Info Level 1 Win NT 4.0 truncates directory listings 
- * for certain patterns of names and/or lengths. The breakage pattern
- * is completely reproducible and can be toggled by the creation of a
- * single file. (E.g. echo hi >foo breaks, rm -f foo works.)
- */
-static int
-smb_proc_readdir_long(struct file *filp, void *dirent, filldir_t filldir,
-                      struct smb_cache_control *ctl)
-{
-        struct dentry *dir = filp->f_path.dentry;
-        struct smb_sb_info *server = server_from_dentry(dir);
-        struct qstr qname;
-        struct smb_fattr fattr;
-        unsigned char *p, *lastname;
-        char *mask, *param;
-        __u16 command;
-        int first, entries_seen;
-        /* Both NT and OS/2 accept info level 1 (but see note below). */
-        int info_level = 260;
-        const int max_matches = 512;
-        unsigned int ff_searchcount = 0;
-        unsigned int ff_eos = 0;
-        unsigned int ff_lastname = 0;
-        unsigned int ff_dir_handle = 0;
-        unsigned int loop_count = 0;
-        unsigned int mask_len, i;
-        int result;
-        struct smb_request *req;
-        unsigned char *name_buf;
-        static struct qstr star = {
-                .name   = "*",
-                .len    = 1,
-        };
-        lock_kernel();
-        /*
-         * We always prefer unix style. Use info level 1 for older
-         * servers that don't do 260.
-         */
-        if (server->opt.capabilities & SMB_CAP_UNIX)
-                info_level = SMB_FIND_FILE_UNIX;
-        else if (server->opt.protocol < SMB_PROTOCOL_NT1)
-                info_level = 1;
-        result = -ENOMEM;
-        if (! (name_buf = kmalloc(SMB_MAXNAMELEN+2, GFP_KERNEL)))
-                goto out;
-        if (! (req = smb_alloc_request(server, server->opt.max_xmit)))
-                goto out_name;
-        param = req->rq_buffer;
-        /*
-         * Encode the initial path
-         */
-        mask = param + 12;
-        result = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dir, &star);
-        if (result <= 0)
-                goto out_free;
-        mask_len = result - 1;  /* mask_len is strlen, not #bytes */
-        result = 0;
-        first = 1;
-        VERBOSE("starting mask_len=%d, mask=%s\n", mask_len, mask);
-        entries_seen = 2;
-        ff_eos = 0;
-        while (ff_eos == 0) {
-                loop_count += 1;
-                if (loop_count > 10) {
-                        printk(KERN_WARNING "smb_proc_readdir_long: "
-                               "Looping in FIND_NEXT??\n");
-                        result = -EIO;
-                        break;
-                }
-                if (first != 0) {
-                        command = TRANSACT2_FINDFIRST;
-                        WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-                        WSET(param, 2, max_matches);    /* max count */
-                        WSET(param, 4, SMB_CLOSE_IF_END);
-                        WSET(param, 6, info_level);
-                        DSET(param, 8, 0);
-                } else {
-                        command = TRANSACT2_FINDNEXT;
-                        VERBOSE("handle=0x%X, lastname=%d, mask=%.*s\n",
-                                ff_dir_handle, ff_lastname, mask_len, mask);
-                        WSET(param, 0, ff_dir_handle);  /* search handle */
-                        WSET(param, 2, max_matches);    /* max count */
-                        WSET(param, 4, info_level);
-                        DSET(param, 6, 0);
-                        WSET(param, 10, SMB_CONTINUE_BIT|SMB_CLOSE_IF_END);
-                }
-                req->rq_trans2_command = command;
-                req->rq_ldata = 0;
-                req->rq_data  = NULL;
-                req->rq_lparm = 12 + mask_len + 1;
-                req->rq_parm  = param;
-                req->rq_flags = 0;
-                result = smb_add_request(req);
-                if (result < 0) {
-                        PARANOIA("error=%d, breaking\n", result);
-                        break;
-                }
-                if (req->rq_rcls == ERRSRV && req->rq_err == ERRerror) {
-                        /* a damn Win95 bug - sometimes it clags if you 
-                           ask it too fast */
-                        schedule_timeout_interruptible(msecs_to_jiffies(200));
-                        continue;
-                }
-                if (req->rq_rcls != 0) {
-                        result = smb_errno(req);
-                        PARANOIA("name=%s, result=%d, rcls=%d, err=%d\n",
-                                 mask, result, req->rq_rcls, req->rq_err);
-                        break;
-                }
-                /* parse out some important return info */
-                if (first != 0) {
-                        ff_dir_handle = WVAL(req->rq_parm, 0);
-                        ff_searchcount = WVAL(req->rq_parm, 2);
-                        ff_eos = WVAL(req->rq_parm, 4);
-                        ff_lastname = WVAL(req->rq_parm, 8);
-                } else {
-                        ff_searchcount = WVAL(req->rq_parm, 0);
-                        ff_eos = WVAL(req->rq_parm, 2);
-                        ff_lastname = WVAL(req->rq_parm, 6);
-                }
-                if (ff_searchcount == 0)
-                        break;
-                /* Now we are ready to parse smb directory entries. */
-                /* point to the data bytes */
-                p = req->rq_data;
-                for (i = 0; i < ff_searchcount; i++) {
-                        /* make sure we stay within the buffer */
-                        if (p >= req->rq_data + req->rq_ldata) {
-                                printk(KERN_ERR "smb_proc_readdir_long: "
-                                       "dirent pointer outside buffer! "
-                                       "%p  %d@%p\n",
-                                       p, req->rq_ldata, req->rq_data);
-                                result = -EIO; /* always a comm. error? */
-                                goto out_free;
-                        }
-                        p = smb_decode_long_dirent(server, p, info_level,
-                                                   &qname, &fattr, name_buf);
-                        /* ignore . and .. from the server */
-                        if (entries_seen == 2 && qname.name[0] == '.') {
-                                if (qname.len == 1)
-                                        continue;
-                                if (qname.name[1] == '.' && qname.len == 2)
-                                        continue;
-                        }
-                        if (!smb_fill_cache(filp, dirent, filldir, ctl, 
-                                            &qname, &fattr))
-                                ;       /* stop reading? */
-                        entries_seen++;
-                }
-                VERBOSE("received %d entries, eos=%d\n", ff_searchcount,ff_eos);
-                /*
-                 * We might need the lastname for continuations.
-                 *
-                 * Note that some servers (win95?) point to the filename and
-                 * others (NT4, Samba using NT1) to the dir entry. We assume
-                 * here that those who do not point to a filename do not need
-                 * this info to continue the listing.
-                 *
-                 * OS/2 needs this and talks infolevel 1.
-                 * NetApps want lastname with infolevel 260.
-                 * win2k want lastname with infolevel 260, and points to
-                 *       the record not to the name.
-                 * Samba+CifsUnixExt doesn't need lastname.
-                 *
-                 * Both are happy if we return the data they point to. So we do.
-                 * (FIXME: above is not true with win2k)
-                 */
-                mask_len = 0;
-                if (info_level != SMB_FIND_FILE_UNIX &&
-                    ff_lastname > 0 && ff_lastname < req->rq_ldata) {
-                        lastname = req->rq_data + ff_lastname;
-                        switch (info_level) {
-                        case 260:
-                                mask_len = req->rq_ldata - ff_lastname;
-                                break;
-                        case 1:
-                                /* lastname points to a length byte */
-                                mask_len = *lastname++;
-                                if (ff_lastname + 1 + mask_len > req->rq_ldata)
-                                        mask_len = req->rq_ldata - ff_lastname - 1;
-                                break;
-                        }
-                        /*
-                         * Update the mask string for the next message.
-                         */
-                        if (mask_len > 255)
-                                mask_len = 255;
-                        if (mask_len)
-                                strncpy(mask, lastname, mask_len);
-                }
-                mask_len = strnlen(mask, mask_len);
-                VERBOSE("new mask, len=%d@%d of %d, mask=%.*s\n",
-                        mask_len, ff_lastname, req->rq_ldata, mask_len, mask);
-                first = 0;
-                loop_count = 0;
-        }
-out_free:
-        smb_rput(req);
-out_name:
-        kfree(name_buf);
-out:
-        unlock_kernel();
-        return result;
-}
-/*
- * This version uses the trans2 TRANSACT2_FINDFIRST message 
- * to get the attribute data.
- *
- * Bugs Noted:
- */
-static int
-smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
-                        struct smb_fattr *fattr)
-{
-        char *param, *mask;
-        __u16 date, time;
-        int mask_len, result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        mask = param + 12;
-        mask_len = smb_encode_path(server, mask, SMB_MAXPATHLEN+1, dentry,NULL);
-        if (mask_len < 0) {
-                result = mask_len;
-                goto out_free;
-        }
-        VERBOSE("name=%s, len=%d\n", mask, mask_len);
-        WSET(param, 0, aSYSTEM | aHIDDEN | aDIR);
-        WSET(param, 2, 1);      /* max count */
-        WSET(param, 4, 1);      /* close after this call */
-        WSET(param, 6, 1);      /* info_level */
-        DSET(param, 8, 0);
-        req->rq_trans2_command = TRANSACT2_FINDFIRST;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = 12 + mask_len;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        if (req->rq_rcls != 0) {
-                result = smb_errno(req);
-#ifdef SMBFS_PARANOIA
-                if (result != -ENOENT)
-                        PARANOIA("error for %s, rcls=%d, err=%d\n",
-                                 mask, req->rq_rcls, req->rq_err);
-#endif
-                goto out_free;
-        }
-        /* Make sure we got enough data ... */
-        result = -EINVAL;
-        if (req->rq_ldata < 22 || WVAL(req->rq_parm, 2) != 1) {
-                PARANOIA("bad result for %s, len=%d, count=%d\n",
-                         mask, req->rq_ldata, WVAL(req->rq_parm, 2));
-                goto out_free;
-        }
-        /*
-         * Decode the response into the fattr ...
-         */
-        date = WVAL(req->rq_data, 0);
-        time = WVAL(req->rq_data, 2);
-        fattr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_ctime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 4);
-        time = WVAL(req->rq_data, 6);
-        fattr->f_atime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_atime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 8);
-        time = WVAL(req->rq_data, 10);
-        fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-        fattr->f_mtime.tv_nsec = 0;
-        VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-                mask, date, time, fattr->f_mtime.tv_sec);
-        fattr->f_size = DVAL(req->rq_data, 12);
-        /* ULONG allocation size */
-        fattr->attr = WVAL(req->rq_data, 20);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_core(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *fattr)
-{
-        int result;
-        char *p;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBgetatr, 0, 0);
-        result = smb_simple_encode_path(req, &p, dir, NULL);
-        if (result < 0)
-                goto out_free;
-        smb_setup_bcc(req, p);
-        if ((result = smb_request_ok(req, SMBgetatr, 10, 0)) < 0)
-                goto out_free;
-        fattr->attr    = WVAL(req->rq_header, smb_vwv0);
-        fattr->f_mtime.tv_sec = local2utc(server, DVAL(req->rq_header, smb_vwv1));
-        fattr->f_mtime.tv_nsec = 0;
-        fattr->f_size  = DVAL(req->rq_header, smb_vwv3);
-        fattr->f_ctime = fattr->f_mtime; 
-        fattr->f_atime = fattr->f_mtime; 
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk("getattr_core: %s/%s, mtime=%ld\n",
-               DENTRY_PATH(dir), fattr->f_mtime);
-#endif
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Bugs Noted:
- * (1) Win 95 swaps the date and time fields in the standard info level.
- */
-static int
-smb_proc_getattr_trans2(struct smb_sb_info *server, struct dentry *dir,
-                        struct smb_request *req, int infolevel)
-{
-        char *p, *param;
-        int result;
-        param = req->rq_buffer;
-        WSET(param, 0, infolevel);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-        if (result < 0)
-                goto out;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_QPATHINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out;
-        if (req->rq_rcls != 0) {
-                VERBOSE("for %s: result=%d, rcls=%d, err=%d\n",
-                        &param[6], result, req->rq_rcls, req->rq_err);
-                result = smb_errno(req);
-                goto out;
-        }
-        result = -ENOENT;
-        if (req->rq_ldata < 22) {
-                PARANOIA("not enough data for %s, len=%d\n",
-                         &param[6], req->rq_ldata);
-                goto out;
-        }
-        result = 0;
-out:
-        return result;
-}
-static int
-smb_proc_getattr_trans2_std(struct smb_sb_info *server, struct dentry *dir,
-                            struct smb_fattr *attr)
-{
-        u16 date, time;
-        int off_date = 0, off_time = 2;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req, SMB_INFO_STANDARD);
-        if (result < 0)
-                goto out_free;
-        /*
-         * Kludge alert: Win 95 swaps the date and time field,
-         * contrary to the CIFS docs and Win NT practice.
-         */
-        if (server->mnt->flags & SMB_MOUNT_WIN95) {
-                off_date = 2;
-                off_time = 0;
-        }
-        date = WVAL(req->rq_data, off_date);
-        time = WVAL(req->rq_data, off_time);
-        attr->f_ctime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_ctime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 4 + off_date);
-        time = WVAL(req->rq_data, 4 + off_time);
-        attr->f_atime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_atime.tv_nsec = 0;
-        date = WVAL(req->rq_data, 8 + off_date);
-        time = WVAL(req->rq_data, 8 + off_time);
-        attr->f_mtime.tv_sec = date_dos2unix(server, date, time);
-        attr->f_mtime.tv_nsec = 0;
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "getattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n",
-               DENTRY_PATH(dir), date, time, attr->f_mtime);
-#endif
-        attr->f_size = DVAL(req->rq_data, 12);
-        attr->attr = WVAL(req->rq_data, 20);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_trans2_all(struct smb_sb_info *server, struct dentry *dir,
-                            struct smb_fattr *attr)
-{
-        struct smb_request *req;
-        int result;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req,
-                                         SMB_QUERY_FILE_ALL_INFO);
-        if (result < 0)
-                goto out_free;
-        attr->f_ctime = smb_ntutc2unixutc(LVAL(req->rq_data, 0));
-        attr->f_atime = smb_ntutc2unixutc(LVAL(req->rq_data, 8));
-        attr->f_mtime = smb_ntutc2unixutc(LVAL(req->rq_data, 16));
-        /* change (24) */
-        attr->attr = WVAL(req->rq_data, 32);
-        /* pad? (34) */
-        /* allocated size (40) */
-        attr->f_size = LVAL(req->rq_data, 48);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
-                      struct smb_fattr *attr)
-{
-        struct smb_request *req;
-        int result;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        result = smb_proc_getattr_trans2(server, dir, req,
-                                         SMB_QUERY_FILE_UNIX_BASIC);
-        if (result < 0)
-                goto out_free;
-        smb_decode_unix_basic(attr, server, req->rq_data);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_getattr_95(struct smb_sb_info *server, struct dentry *dir,
-                    struct smb_fattr *attr)
-{
-        struct inode *inode = dir->d_inode;
-        int result;
-        /* FIXME: why not use the "all" version? */
-        result = smb_proc_getattr_trans2_std(server, dir, attr);
-        if (result < 0)
-                goto out;
-        /*
-         * None of the getattr versions here can make win9x return the right
-         * filesize if there are changes made to an open file.
-         * A seek-to-end does return the right size, but we only need to do
-         * that on files we have written.
-         */
-        if (inode && SMB_I(inode)->flags & SMB_F_LOCALWRITE &&
-            smb_is_open(inode))
-        {
-                __u16 fileid = SMB_I(inode)->fileid;
-                attr->f_size = smb_proc_seek(server, fileid, 2, 0);
-        }
-out:
-        return result;
-}
-static int
-smb_proc_ops_wait(struct smb_sb_info *server)
-{
-        int result;
-        result = wait_event_interruptible_timeout(server->conn_wq,
-                                server->conn_complete, 30*HZ);
-        if (!result || signal_pending(current))
-                return -EIO;
-        return 0;
-}
-static int
-smb_proc_getattr_null(struct smb_sb_info *server, struct dentry *dir,
-                          struct smb_fattr *fattr)
-{
-        int result;
-        if (smb_proc_ops_wait(server) < 0)
-                return -EIO;
-        smb_init_dirent(server, fattr);
-        result = server->ops->getattr(server, dir, fattr);
-        smb_finish_dirent(server, fattr);
-        return result;
-}
-static int
-smb_proc_readdir_null(struct file *filp, void *dirent, filldir_t filldir,
-                      struct smb_cache_control *ctl)
-{
-        struct smb_sb_info *server = server_from_dentry(filp->f_path.dentry);
-        if (smb_proc_ops_wait(server) < 0)
-                return -EIO;
-        return server->ops->readdir(filp, dirent, filldir, ctl);
-}
-int
-smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dir);
-        int result;
-        smb_init_dirent(server, fattr);
-        result = server->ops->getattr(server, dir, fattr);
-        smb_finish_dirent(server, fattr);
-        return result;
-}
-/*
- * Because of bugs in the core protocol, we use this only to set
- * attributes. See smb_proc_settime() below for timestamp handling.
- *
- * Bugs Noted:
- * (1) If mtime is non-zero, both Win 3.1 and Win 95 fail
- * with an undocumented error (ERRDOS code 50). Setting
- * mtime to 0 allows the attributes to be set.
- * (2) The extra parameters following the name string aren't
- * in the CIFS docs, but seem to be necessary for operation.
- */
-static int
-smb_proc_setattr_core(struct smb_sb_info *server, struct dentry *dentry,
-                      __u16 attr)
-{
-        char *p;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        p = smb_setup_header(req, SMBsetatr, 8, 0);
-        WSET(req->rq_header, smb_vwv0, attr);
-        DSET(req->rq_header, smb_vwv1, 0); /* mtime */
-        WSET(req->rq_header, smb_vwv3, 0); /* reserved values */
-        WSET(req->rq_header, smb_vwv4, 0);
-        WSET(req->rq_header, smb_vwv5, 0);
-        WSET(req->rq_header, smb_vwv6, 0);
-        WSET(req->rq_header, smb_vwv7, 0);
-        result = smb_simple_encode_path(req, &p, dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        if (p + 2 > (char *)req->rq_buffer + req->rq_bufsize) {
-                result = -ENAMETOOLONG;
-                goto out_free;
-        }
-        *p++ = 4;
-        *p++ = 0;
-        smb_setup_bcc(req, p);
-        result = smb_request_ok(req, SMBsetatr, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Because of bugs in the trans2 setattr messages, we must set
- * attributes and timestamps separately. The core SMBsetatr
- * message seems to be the only reliable way to set attributes.
- */
-int
-smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dir);
-        int result;
-        VERBOSE("setting %s/%s, open=%d\n", 
-                DENTRY_PATH(dir), smb_is_open(dir->d_inode));
-        result = smb_proc_setattr_core(server, dir, fattr->attr);
-        return result;
-}
-/*
- * Sets the timestamps for an file open with write permissions.
- */
-static int
-smb_proc_setattr_ext(struct smb_sb_info *server,
-                      struct inode *inode, struct smb_fattr *fattr)
-{
-        __u16 date, time;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBsetattrE, 7, 0);
-        WSET(req->rq_header, smb_vwv0, SMB_I(inode)->fileid);
-        /* We don't change the creation time */
-        WSET(req->rq_header, smb_vwv1, 0);
-        WSET(req->rq_header, smb_vwv2, 0);
-        date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-        WSET(req->rq_header, smb_vwv3, date);
-        WSET(req->rq_header, smb_vwv4, time);
-        date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-        WSET(req->rq_header, smb_vwv5, date);
-        WSET(req->rq_header, smb_vwv6, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "smb_proc_setattr_ext: date=%d, time=%d, mtime=%ld\n",
-               date, time, fattr->f_mtime);
-#endif
-        req->rq_flags |= SMB_REQ_NORETRY;
-        result = smb_request_ok(req, SMBsetattrE, 0, 0);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Bugs Noted:
- * (1) The TRANSACT2_SETPATHINFO message under Win NT 4.0 doesn't
- * set the file's attribute flags.
- */
-static int
-smb_proc_setattr_trans2(struct smb_sb_info *server,
-                        struct dentry *dir, struct smb_fattr *fattr)
-{
-        __u16 date, time;
-        char *p, *param;
-        int result;
-        char data[26];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, 1);      /* Info level SMB_INFO_STANDARD */
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, dir, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        WSET(data, 0, 0); /* creation time */
-        WSET(data, 2, 0);
-        date_unix2dos(server, fattr->f_atime.tv_sec, &date, &time);
-        WSET(data, 4, date);
-        WSET(data, 6, time);
-        date_unix2dos(server, fattr->f_mtime.tv_sec, &date, &time);
-        WSET(data, 8, date);
-        WSET(data, 10, time);
-#ifdef SMBFS_DEBUG_TIMESTAMP
-        printk(KERN_DEBUG "setattr_trans2: %s/%s, date=%x, time=%x, mtime=%ld\n", 
-               DENTRY_PATH(dir), date, time, fattr->f_mtime);
-#endif
-        DSET(data, 12, 0); /* size */
-        DSET(data, 16, 0); /* blksize */
-        WSET(data, 20, 0); /* attr */
-        DSET(data, 22, 0); /* ULONG EA size */
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = 26;
-        req->rq_data  = data;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        result = 0;
-        if (req->rq_rcls != 0)
-                result = smb_errno(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * ATTR_MODE      0x001
- * ATTR_UID       0x002
- * ATTR_GID       0x004
- * ATTR_SIZE      0x008
- * ATTR_ATIME     0x010
- * ATTR_MTIME     0x020
- * ATTR_CTIME     0x040
- * ATTR_ATIME_SET 0x080
- * ATTR_MTIME_SET 0x100
- * ATTR_FORCE     0x200 
- * ATTR_ATTR_FLAG 0x400
- *
- * major/minor should only be set by mknod.
- */
-int
-smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
-                      unsigned int major, unsigned int minor)
-{
-        struct smb_sb_info *server = server_from_dentry(d);
-        u64 nttime;
-        char *p, *param;
-        int result;
-        char data[100];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        DEBUG1("valid flags = 0x%04x\n", attr->ia_valid);
-        WSET(param, 0, SMB_SET_FILE_UNIX_BASIC);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        /* 0 L file size in bytes */
-        /* 8 L file size on disk in bytes (block count) */
-        /* 40 L uid */
-        /* 48 L gid */
-        /* 56 W file type enum */
-        /* 60 L devmajor */
-        /* 68 L devminor */
-        /* 76 L unique ID (inode) */
-        /* 84 L permissions */
-        /* 92 L link count */
-        LSET(data, 0, SMB_SIZE_NO_CHANGE);
-        LSET(data, 8, SMB_SIZE_NO_CHANGE);
-        LSET(data, 16, SMB_TIME_NO_CHANGE);
-        LSET(data, 24, SMB_TIME_NO_CHANGE);
-        LSET(data, 32, SMB_TIME_NO_CHANGE);
-        LSET(data, 40, SMB_UID_NO_CHANGE);
-        LSET(data, 48, SMB_GID_NO_CHANGE);
-        DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
-        LSET(data, 60, major);
-        LSET(data, 68, minor);
-        LSET(data, 76, 0);
-        LSET(data, 84, SMB_MODE_NO_CHANGE);
-        LSET(data, 92, 0);
-        if (attr->ia_valid & ATTR_SIZE) {
-                LSET(data, 0, attr->ia_size);
-                LSET(data, 8, 0); /* can't set anyway */
-        }
-        /*
-         * FIXME: check the conversion function it the correct one
-         *
-         * we can't set ctime but we might as well pass this to the server
-         * and let it ignore it.
-         */
-        if (attr->ia_valid & ATTR_CTIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_ctime);
-                LSET(data, 16, nttime);
-        }
-        if (attr->ia_valid & ATTR_ATIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_atime);
-                LSET(data, 24, nttime);
-        }
-        if (attr->ia_valid & ATTR_MTIME) {
-                nttime = smb_unixutc2ntutc(attr->ia_mtime);
-                LSET(data, 32, nttime);
-        }
-        
-        if (attr->ia_valid & ATTR_UID) {
-                LSET(data, 40, attr->ia_uid);
-        }
-        if (attr->ia_valid & ATTR_GID) {
-                LSET(data, 48, attr->ia_gid); 
-        }
-        
-        if (attr->ia_valid & ATTR_MODE) {
-                LSET(data, 84, attr->ia_mode);
-        }
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = 100;
-        req->rq_data  = data;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Set the modify and access timestamps for a file.
- *
- * Incredibly enough, in all of SMB there is no message to allow
- * setting both attributes and timestamps at once. 
- *
- * Bugs Noted:
- * (1) Win 95 doesn't support the TRANSACT2_SETFILEINFO message 
- * with info level 1 (INFO_STANDARD).
- * (2) Win 95 seems not to support setting directory timestamps.
- * (3) Under the core protocol apparently the only way to set the
- * timestamp is to open and close the file.
- */
-int
-smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
-{
-        struct smb_sb_info *server = server_from_dentry(dentry);
-        struct inode *inode = dentry->d_inode;
-        int result;
-        VERBOSE("setting %s/%s, open=%d\n",
-                DENTRY_PATH(dentry), smb_is_open(inode));
-        /* setting the time on a Win95 server fails (tridge) */
-        if (server->opt.protocol >= SMB_PROTOCOL_LANMAN2 && 
-            !(server->mnt->flags & SMB_MOUNT_WIN95)) {
-                if (smb_is_open(inode) && SMB_I(inode)->access != SMB_O_RDONLY)
-                        result = smb_proc_setattr_ext(server, inode, fattr);
-                else
-                        result = smb_proc_setattr_trans2(server, dentry, fattr);
-        } else {
-                /*
-                 * Fail silently on directories ... timestamp can't be set?
-                 */
-                result = 0;
-                if (S_ISREG(inode->i_mode)) {
-                        /*
-                         * Set the mtime by opening and closing the file.
-                         * Note that the file is opened read-only, but this
-                         * still allows us to set the date (tridge)
-                         */
-                        result = -EACCES;
-                        if (!smb_is_open(inode))
-                                smb_proc_open(server, dentry, SMB_O_RDONLY);
-                        if (smb_is_open(inode)) {
-                                inode->i_mtime = fattr->f_mtime;
-                                result = smb_proc_close_inode(server, inode);
-                        }
-                }
-        }
-        return result;
-}
-int
-smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
-{
-        struct smb_sb_info *server = SMB_SB(dentry->d_sb);
-        int result;
-        char *p;
-        long unit;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 0)))
-                goto out;
-        smb_setup_header(req, SMBdskattr, 0, 0);
-        if ((result = smb_request_ok(req, SMBdskattr, 5, 0)) < 0)
-                goto out_free;
-        p = SMB_VWV(req->rq_header);
-        unit = (WVAL(p, 2) * WVAL(p, 4)) >> SMB_ST_BLKSHIFT;
-        attr->f_blocks = WVAL(p, 0) * unit;
-        attr->f_bsize  = SMB_ST_BLKSIZE;
-        attr->f_bavail = attr->f_bfree = WVAL(p, 6) * unit;
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-int
-smb_proc_read_link(struct smb_sb_info *server, struct dentry *d,
-                   char *buffer, int len)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        DEBUG1("readlink of %s/%s\n", DENTRY_PATH(d));
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_QUERY_FILE_UNIX_LINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param+6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_QPATHINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-                &param[6], result, req->rq_rcls, req->rq_err);
-        /* copy data up to the \0 or buffer length */
-        result = len;
-        if (req->rq_ldata < len)
-                result = req->rq_ldata;
-        strncpy(buffer, req->rq_data, result);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Create a symlink object called dentry which points to oldpath.
- * Samba does not permit dangling links but returns a suitable error message.
- */
-int
-smb_proc_symlink(struct smb_sb_info *server, struct dentry *d,
-                 const char *oldpath)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_SET_FILE_UNIX_LINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1, d, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_ldata = strlen(oldpath) + 1;
-        req->rq_data  = (char *) oldpath;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-                &param[6], result, req->rq_rcls, req->rq_err);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-/*
- * Create a hard link object called new_dentry which points to dentry.
- */
-int
-smb_proc_link(struct smb_sb_info *server, struct dentry *dentry,
-              struct dentry *new_dentry)
-{
-        char *p, *param;
-        int result;
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, PAGE_SIZE)))
-                goto out;
-        param = req->rq_buffer;
-        WSET(param, 0, SMB_SET_FILE_UNIX_HLINK);
-        DSET(param, 2, 0);
-        result = smb_encode_path(server, param + 6, SMB_MAXPATHLEN+1,
-                                 new_dentry, NULL);
-        if (result < 0)
-                goto out_free;
-        p = param + 6 + result;
-        /* Grr, pointless separation of parameters and data ... */
-        req->rq_data = p;
-        req->rq_ldata = smb_encode_path(server, p, SMB_MAXPATHLEN+1,
-                                        dentry, NULL);
-        req->rq_trans2_command = TRANSACT2_SETPATHINFO;
-        req->rq_lparm = p - param;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        DEBUG1("for %s: result=%d, rcls=%d, err=%d\n",
-               &param[6], result, req->rq_rcls, req->rq_err);
-        result = 0;
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static int
-smb_proc_query_cifsunix(struct smb_sb_info *server)
-{
-        int result;
-        int major, minor;
-        u64 caps;
-        char param[2];
-        struct smb_request *req;
-        result = -ENOMEM;
-        if (! (req = smb_alloc_request(server, 100)))
-                goto out;
-        WSET(param, 0, SMB_QUERY_CIFS_UNIX_INFO);
-        req->rq_trans2_command = TRANSACT2_QFSINFO;
-        req->rq_ldata = 0;
-        req->rq_data  = NULL;
-        req->rq_lparm = 2;
-        req->rq_parm  = param;
-        req->rq_flags = 0;
-        result = smb_add_request(req);
-        if (result < 0)
-                goto out_free;
-        if (req->rq_ldata < 12) {
-                PARANOIA("Not enough data\n");
-                goto out_free;
-        }
-        major = WVAL(req->rq_data, 0);
-        minor = WVAL(req->rq_data, 2);
-        DEBUG1("Server implements CIFS Extensions for UNIX systems v%d.%d\n",
-               major, minor);
-        /* FIXME: verify that we are ok with this major/minor? */
-        caps = LVAL(req->rq_data, 4);
-        DEBUG1("Server capabilities 0x%016llx\n", caps);
-out_free:
-        smb_rput(req);
-out:
-        return result;
-}
-static void
-install_ops(struct smb_ops *dst, struct smb_ops *src)
-{
-        memcpy(dst, src, sizeof(void *) * SMB_OPS_NUM_STATIC);
-}
-/* < LANMAN2 */
-static struct smb_ops smb_ops_core =
-{
-        .read           = smb_proc_read,
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_short,
-        .getattr        = smb_proc_getattr_core,
-        .truncate       = smb_proc_trunc32,
-};
-/* LANMAN2, OS/2, others? */
-static struct smb_ops smb_ops_os2 =
-{
-        .read           = smb_proc_read,
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_trans2_std,
-        .truncate       = smb_proc_trunc32,
-};
-/* Win95, and possibly some NetApp versions too */
-static struct smb_ops smb_ops_win95 =
-{
-        .read           = smb_proc_read,    /* does not support 12word readX */
-        .write          = smb_proc_write,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_95,
-        .truncate       = smb_proc_trunc95,
-};
-/* Samba, NT4 and NT5 */
-static struct smb_ops smb_ops_winNT =
-{
-        .read           = smb_proc_readX,
-        .write          = smb_proc_writeX,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_trans2_all,
-        .truncate       = smb_proc_trunc64,
-};
-/* Samba w/ unix extensions. Others? */
-static struct smb_ops smb_ops_unix =
-{
-        .read           = smb_proc_readX,
-        .write          = smb_proc_writeX,
-        .readdir        = smb_proc_readdir_long,
-        .getattr        = smb_proc_getattr_unix,
-        /* FIXME: core/ext/time setattr needs to be cleaned up! */
-        /* .setattr     = smb_proc_setattr_unix, */
-        .truncate       = smb_proc_trunc64,
-};
-/* Place holder until real ops are in place */
-static struct smb_ops smb_ops_null =
-{
-        .readdir        = smb_proc_readdir_null,
-        .getattr        = smb_proc_getattr_null,
-};
-void smb_install_null_ops(struct smb_ops *ops)
-{
-        install_ops(ops, &smb_ops_null);
-}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
deleted file mode 100644
index 05939a6f43e6..000000000000
--- a/fs/smbfs/proto.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Autogenerated with cproto on:  Sat Sep 13 17:18:51 CEST 2003
- */
-struct smb_request;
-struct sock;
-struct statfs;
-/* proc.c */
-extern int smb_setcodepage(struct smb_sb_info *server, struct smb_nls_codepage *cp);
-extern __u32 smb_len(__u8 *p);
-extern int smb_get_rsize(struct smb_sb_info *server);
-extern int smb_get_wsize(struct smb_sb_info *server);
-extern int smb_errno(struct smb_request *req);
-extern int smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt);
-extern __u8 *smb_setup_header(struct smb_request *req, __u8 command, __u16 wct, __u16 bcc);
-extern int smb_open(struct dentry *dentry, int wish);
-extern int smb_close(struct inode *ino);
-extern int smb_close_fileid(struct dentry *dentry, __u16 fileid);
-extern int smb_proc_create(struct dentry *dentry, __u16 attr, time_t ctime, __u16 *fileid);
-extern int smb_proc_mv(struct dentry *old_dentry, struct dentry *new_dentry);
-extern int smb_proc_mkdir(struct dentry *dentry);
-extern int smb_proc_rmdir(struct dentry *dentry);
-extern int smb_proc_unlink(struct dentry *dentry);
-extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
-extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr,
-                                 struct super_block *sb);
-extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
-extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
-extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
-extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
-extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
-extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
-extern void smb_install_null_ops(struct smb_ops *ops);
-/* dir.c */
-extern const struct file_operations smb_dir_operations;
-extern const struct inode_operations smb_dir_inode_operations;
-extern const struct inode_operations smb_dir_inode_operations_unix;
-extern void smb_new_dentry(struct dentry *dentry);
-extern void smb_renew_times(struct dentry *dentry);
-/* cache.c */
-extern void smb_invalid_dir_cache(struct inode *dir);
-extern void smb_invalidate_dircache_entries(struct dentry *parent);
-extern struct dentry *smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos);
-extern int smb_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct smb_cache_control *ctrl, struct qstr *qname, struct smb_fattr *entry);
-/* sock.c */
-extern void smb_data_ready(struct sock *sk, int len);
-extern int smb_valid_socket(struct inode *inode);
-extern void smb_close_socket(struct smb_sb_info *server);
-extern int smb_recv_available(struct smb_sb_info *server);
-extern int smb_receive_header(struct smb_sb_info *server);
-extern int smb_receive_drop(struct smb_sb_info *server);
-extern int smb_receive(struct smb_sb_info *server, struct smb_request *req);
-extern int smb_send_request(struct smb_request *req);
-/* inode.c */
-extern struct inode *smb_iget(struct super_block *sb, struct smb_fattr *fattr);
-extern void smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr);
-extern void smb_invalidate_inodes(struct smb_sb_info *server);
-extern int smb_revalidate_inode(struct dentry *dentry);
-extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
-/* file.c */
-extern const struct address_space_operations smb_file_aops;
-extern const struct file_operations smb_file_operations;
-extern const struct inode_operations smb_file_inode_operations;
-/* ioctl.c */
-extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-/* smbiod.c */
-extern void smbiod_wake_up(void);
-extern int smbiod_register_server(struct smb_sb_info *server);
-extern void smbiod_unregister_server(struct smb_sb_info *server);
-extern void smbiod_flush(struct smb_sb_info *server);
-extern int smbiod_retry(struct smb_sb_info *server);
-/* request.c */
-extern int smb_init_request_cache(void);
-extern void smb_destroy_request_cache(void);
-extern struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize);
-extern void smb_rput(struct smb_request *req);
-extern int smb_add_request(struct smb_request *req);
-extern int smb_request_send_server(struct smb_sb_info *server);
-extern int smb_request_recv(struct smb_sb_info *server);
-/* symlink.c */
-extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname);
-extern const struct inode_operations smb_link_inode_operations;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
deleted file mode 100644
index 45f45933e862..000000000000
--- a/fs/smbfs/request.c
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- *  request.c
- *
- *  Copyright (C) 2001 by Urban Widmark
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/net.h>
-#include <linux/sched.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-/* #define SMB_SLAB_DEBUG       (SLAB_RED_ZONE | SLAB_POISON) */
-#define SMB_SLAB_DEBUG  0
-/* cache for request structures */
-static struct kmem_cache *req_cachep;
-static int smb_request_send_req(struct smb_request *req);
-/*
-  /proc/slabinfo:
-  name, active, num, objsize, active_slabs, num_slaps, #pages
-*/
-int smb_init_request_cache(void)
-{
-        req_cachep = kmem_cache_create("smb_request",
-                                       sizeof(struct smb_request), 0,
-                                       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
-                                       NULL);
-        if (req_cachep == NULL)
-                return -ENOMEM;
-        return 0;
-}
-void smb_destroy_request_cache(void)
-{
-        kmem_cache_destroy(req_cachep);
-}
-/*
- * Allocate and initialise a request structure
- */
-static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
-                                                int bufsize)
-{
-        struct smb_request *req;
-        unsigned char *buf = NULL;
-        req = kmem_cache_zalloc(req_cachep, GFP_KERNEL);
-        VERBOSE("allocating request: %p\n", req);
-        if (!req)
-                goto out;
-        if (bufsize > 0) {
-                buf = kmalloc(bufsize, GFP_NOFS);
-                if (!buf) {
-                        kmem_cache_free(req_cachep, req);
-                        return NULL;
-                }
-        }
-        req->rq_buffer = buf;
-        req->rq_bufsize = bufsize;
-        req->rq_server = server;
-        init_waitqueue_head(&req->rq_wait);
-        INIT_LIST_HEAD(&req->rq_queue);
-        atomic_set(&req->rq_count, 1);
-out:
-        return req;
-}
-struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
-{
-        struct smb_request *req = NULL;
-        for (;;) {
-                atomic_inc(&server->nr_requests);
-                if (atomic_read(&server->nr_requests) <= MAX_REQUEST_HARD) {
-                        req = smb_do_alloc_request(server, bufsize);
-                        if (req != NULL)
-                                break;
-                }
-#if 0
-                /*
-                 * Try to free up at least one request in order to stay
-                 * below the hard limit
-                 */
-                if (nfs_try_to_free_pages(server))
-                        continue;
-                if (fatal_signal_pending(current))
-                        return ERR_PTR(-ERESTARTSYS);
-                current->policy = SCHED_YIELD;
-                schedule();
-#else
-                /* FIXME: we want something like nfs does above, but that
-                   requires changes to all callers and can wait. */
-                break;
-#endif
-        }
-        return req;
-}
-static void smb_free_request(struct smb_request *req)
-{
-        atomic_dec(&req->rq_server->nr_requests);
-        if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
-                kfree(req->rq_buffer);
-        kfree(req->rq_trans2buffer);
-        kmem_cache_free(req_cachep, req);
-}
-/*
- * What prevents a rget to race with a rput? The count must never drop to zero
- * while it is in use. Only rput if it is ok that it is free'd.
- */
-static void smb_rget(struct smb_request *req)
-{
-        atomic_inc(&req->rq_count);
-}
-void smb_rput(struct smb_request *req)
-{
-        if (atomic_dec_and_test(&req->rq_count)) {
-                list_del_init(&req->rq_queue);
-                smb_free_request(req);
-        }
-}
-/* setup to receive the data part of the SMB */
-static int smb_setup_bcc(struct smb_request *req)
-{
-        int result = 0;
-        req->rq_rlen = smb_len(req->rq_header) + 4 - req->rq_bytes_recvd;
-        if (req->rq_rlen > req->rq_bufsize) {
-                PARANOIA("Packet too large %d > %d\n",
-                         req->rq_rlen, req->rq_bufsize);
-                return -ENOBUFS;
-        }
-        req->rq_iov[0].iov_base = req->rq_buffer;
-        req->rq_iov[0].iov_len  = req->rq_rlen;
-        req->rq_iovlen = 1;
-        return result;
-}
-/*
- * Prepare a "normal" request structure.
- */
-static int smb_setup_request(struct smb_request *req)
-{
-        int len = smb_len(req->rq_header) + 4;
-        req->rq_slen = len;
-        /* if we expect a data part in the reply we set the iov's to read it */
-        if (req->rq_resp_bcc)
-                req->rq_setup_read = smb_setup_bcc;
-        /* This tries to support re-using the same request */
-        req->rq_bytes_sent = 0;
-        req->rq_rcls = 0;
-        req->rq_err = 0;
-        req->rq_errno = 0;
-        req->rq_fragment = 0;
-        kfree(req->rq_trans2buffer);
-        req->rq_trans2buffer = NULL;
-        return 0;
-}
-/*
- * Prepare a transaction2 request structure
- */
-static int smb_setup_trans2request(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        int mparam, mdata;
-        static unsigned char padding[4];
-        /* I know the following is very ugly, but I want to build the
-           smb packet as efficiently as possible. */
-        const int smb_parameters = 15;
-        const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
-        const int oparam = ALIGN(header + 3, sizeof(u32));
-        const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
-        const int bcc = (req->rq_data ? odata + req->rq_ldata :
-                                        oparam + req->rq_lparm) - header;
-        if ((bcc + oparam) > server->opt.max_xmit)
-                return -ENOMEM;
-        smb_setup_header(req, SMBtrans2, smb_parameters, bcc);
-        /*
-         * max parameters + max data + max setup == bufsize to make NT4 happy
-         * and not abort the transfer or split into multiple responses. It also
-         * makes smbfs happy as handling packets larger than the buffer size
-         * is extra work.
-         *
-         * OS/2 is probably going to hate me for this ...
-         */
-        mparam = SMB_TRANS2_MAX_PARAM;
-        mdata = req->rq_bufsize - mparam;
-        mdata = server->opt.max_xmit - mparam - 100;
-        if (mdata < 1024) {
-                mdata = 1024;
-                mparam = 20;
-        }
-#if 0
-        /* NT/win2k has ~4k max_xmit, so with this we request more than it wants
-           to return as one SMB. Useful for testing the fragmented trans2
-           handling. */
-        mdata = 8192;
-#endif
-        WSET(req->rq_header, smb_tpscnt, req->rq_lparm);
-        WSET(req->rq_header, smb_tdscnt, req->rq_ldata);
-        WSET(req->rq_header, smb_mprcnt, mparam);
-        WSET(req->rq_header, smb_mdrcnt, mdata);
-        WSET(req->rq_header, smb_msrcnt, 0);    /* max setup always 0 ? */
-        WSET(req->rq_header, smb_flags, 0);
-        DSET(req->rq_header, smb_timeout, 0);
-        WSET(req->rq_header, smb_pscnt, req->rq_lparm);
-        WSET(req->rq_header, smb_psoff, oparam - 4);
-        WSET(req->rq_header, smb_dscnt, req->rq_ldata);
-        WSET(req->rq_header, smb_dsoff, req->rq_data ? odata - 4 : 0);
-        *(req->rq_header + smb_suwcnt) = 0x01;          /* setup count */
-        *(req->rq_header + smb_suwcnt + 1) = 0x00;      /* reserved */
-        WSET(req->rq_header, smb_setup0, req->rq_trans2_command);
-        req->rq_iovlen = 2;
-        req->rq_iov[0].iov_base = (void *) req->rq_header;
-        req->rq_iov[0].iov_len = oparam;
-        req->rq_iov[1].iov_base = (req->rq_parm==NULL) ? padding : req->rq_parm;
-        req->rq_iov[1].iov_len = req->rq_lparm;
-        req->rq_slen = oparam + req->rq_lparm;
-        if (req->rq_data) {
-                req->rq_iovlen += 2;
-                req->rq_iov[2].iov_base = padding;
-                req->rq_iov[2].iov_len = odata - oparam - req->rq_lparm;
-                req->rq_iov[3].iov_base = req->rq_data;
-                req->rq_iov[3].iov_len = req->rq_ldata;
-                req->rq_slen = odata + req->rq_ldata;
-        }
-        /* always a data part for trans2 replies */
-        req->rq_setup_read = smb_setup_bcc;
-        return 0;
-}
-/*
- * Add a request and tell smbiod to process it
- */
-int smb_add_request(struct smb_request *req)
-{
-        long timeleft;
-        struct smb_sb_info *server = req->rq_server;
-        int result = 0;
-        smb_setup_request(req);
-        if (req->rq_trans2_command) {
-                if (req->rq_buffer == NULL) {
-                        PARANOIA("trans2 attempted without response buffer!\n");
-                        return -EIO;
-                }
-                result = smb_setup_trans2request(req);
-        }
-        if (result < 0)
-                return result;
-#ifdef SMB_DEBUG_PACKET_SIZE
-        add_xmit_stats(req);
-#endif
-        /* add 'req' to the queue of requests */
-        if (smb_lock_server_interruptible(server))
-                return -EINTR;
-        /*
-         * Try to send the request as the process. If that fails we queue the
-         * request and let smbiod send it later.
-         */
-        /* FIXME: each server has a number on the maximum number of parallel
-           requests. 10, 50 or so. We should not allow more requests to be
-           active. */
-        if (server->mid > 0xf000)
-                server->mid = 0;
-        req->rq_mid = server->mid++;
-        WSET(req->rq_header, smb_mid, req->rq_mid);
-        result = 0;
-        if (server->state == CONN_VALID) {
-                if (list_empty(&server->xmitq))
-                        result = smb_request_send_req(req);
-                if (result < 0) {
-                        /* Connection lost? */
-                        server->conn_error = result;
-                        server->state = CONN_INVALID;
-                }
-        }
-        if (result != 1)
-                list_add_tail(&req->rq_queue, &server->xmitq);
-        smb_rget(req);
-        if (server->state != CONN_VALID)
-                smbiod_retry(server);
-        smb_unlock_server(server);
-        smbiod_wake_up();
-        timeleft = wait_event_interruptible_timeout(req->rq_wait,
-                                    req->rq_flags & SMB_REQ_RECEIVED, 30*HZ);
-        if (!timeleft || signal_pending(current)) {
-                /*
-                 * On timeout or on interrupt we want to try and remove the
-                 * request from the recvq/xmitq.
-                 * First check if the request is still part of a queue. (May
-                 * have been removed by some error condition)
-                 */
-                smb_lock_server(server);
-                if (!list_empty(&req->rq_queue)) {
-                        list_del_init(&req->rq_queue);
-                        smb_rput(req);
-                }
-                smb_unlock_server(server);
-        }
-        if (!timeleft) {
-                PARANOIA("request [%p, mid=%d] timed out!\n",
-                         req, req->rq_mid);
-                VERBOSE("smb_com:  %02x\n", *(req->rq_header + smb_com));
-                VERBOSE("smb_rcls: %02x\n", *(req->rq_header + smb_rcls));
-                VERBOSE("smb_flg:  %02x\n", *(req->rq_header + smb_flg));
-                VERBOSE("smb_tid:  %04x\n", WVAL(req->rq_header, smb_tid));
-                VERBOSE("smb_pid:  %04x\n", WVAL(req->rq_header, smb_pid));
-                VERBOSE("smb_uid:  %04x\n", WVAL(req->rq_header, smb_uid));
-                VERBOSE("smb_mid:  %04x\n", WVAL(req->rq_header, smb_mid));
-                VERBOSE("smb_wct:  %02x\n", *(req->rq_header + smb_wct));
-                req->rq_rcls = ERRSRV;
-                req->rq_err  = ERRtimeout;
-                /* Just in case it was "stuck" */
-                smbiod_wake_up();
-        }
-        VERBOSE("woke up, rcls=%d\n", req->rq_rcls);
-        if (req->rq_rcls != 0)
-                req->rq_errno = smb_errno(req);
-        if (signal_pending(current))
-                req->rq_errno = -ERESTARTSYS;
-        return req->rq_errno;
-}
-/*
- * Send a request and place it on the recvq if successfully sent.
- * Must be called with the server lock held.
- */
-static int smb_request_send_req(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        int result;
-        if (req->rq_bytes_sent == 0) {
-                WSET(req->rq_header, smb_tid, server->opt.tid);
-                WSET(req->rq_header, smb_pid, 1);
-                WSET(req->rq_header, smb_uid, server->opt.server_uid);
-        }
-        result = smb_send_request(req);
-        if (result < 0 && result != -EAGAIN)
-                goto out;
-        result = 0;
-        if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
-                goto out;
-        list_move_tail(&req->rq_queue, &server->recvq);
-        result = 1;
-out:
-        return result;
-}
-/*
- * Sends one request for this server. (smbiod)
- * Must be called with the server lock held.
- * Returns: <0 on error
- *           0 if no request could be completely sent
- *           1 if all data for one request was sent
- */
-int smb_request_send_server(struct smb_sb_info *server)
-{
-        struct list_head *head;
-        struct smb_request *req;
-        int result;
-        if (server->state != CONN_VALID)
-                return 0;
-        /* dequeue first request, if any */
-        req = NULL;
-        head = server->xmitq.next;
-        if (head != &server->xmitq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-        }
-        if (!req)
-                return 0;
-        result = smb_request_send_req(req);
-        if (result < 0) {
-                server->conn_error = result;
-                list_move(&req->rq_queue, &server->xmitq);
-                result = -EIO;
-                goto out;
-        }
-out:
-        return result;
-}
-/*
- * Try to find a request matching this "mid". Typically the first entry will
- * be the matching one.
- */
-static struct smb_request *find_request(struct smb_sb_info *server, int mid)
-{
-        struct list_head *tmp;
-        struct smb_request *req = NULL;
-        list_for_each(tmp, &server->recvq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                if (req->rq_mid == mid) {
-                        break;
-                }
-                req = NULL;
-        }
-        if (!req) {
-                VERBOSE("received reply with mid %d but no request!\n",
-                        WVAL(server->header, smb_mid));
-                server->rstate = SMB_RECV_DROP;
-        }
-        return req;
-}
-/*
- * Called when we have read the smb header and believe this is a response.
- */
-static int smb_init_request(struct smb_sb_info *server, struct smb_request *req)
-{
-        int hdrlen, wct;
-        memcpy(req->rq_header, server->header, SMB_HEADER_LEN);
-        wct = *(req->rq_header + smb_wct);
-        if (wct > 20) { 
-                PARANOIA("wct too large, %d > 20\n", wct);
-                server->rstate = SMB_RECV_DROP;
-                return 0;
-        }
-        req->rq_resp_wct = wct;
-        hdrlen = SMB_HEADER_LEN + wct*2 + 2;
-        VERBOSE("header length: %d   smb_wct: %2d\n", hdrlen, wct);
-        req->rq_bytes_recvd = SMB_HEADER_LEN;
-        req->rq_rlen = hdrlen;
-        req->rq_iov[0].iov_base = req->rq_header;
-        req->rq_iov[0].iov_len  = hdrlen;
-        req->rq_iovlen = 1;
-        server->rstate = SMB_RECV_PARAM;
-#ifdef SMB_DEBUG_PACKET_SIZE
-        add_recv_stats(smb_len(server->header));
-#endif
-        return 0;
-}
-/*
- * Reads the SMB parameters
- */
-static int smb_recv_param(struct smb_sb_info *server, struct smb_request *req)
-{
-        int result;
-        result = smb_receive(server, req);
-        if (result < 0)
-                return result;
-        if (req->rq_bytes_recvd < req->rq_rlen)
-                return 0;
-        VERBOSE("result: %d   smb_bcc:  %04x\n", result,
-                WVAL(req->rq_header, SMB_HEADER_LEN +
-                     (*(req->rq_header + smb_wct) * 2)));
-        result = 0;
-        req->rq_iov[0].iov_base = NULL;
-        req->rq_rlen = 0;
-        if (req->rq_callback)
-                req->rq_callback(req);
-        else if (req->rq_setup_read)
-                result = req->rq_setup_read(req);
-        if (result < 0) {
-                server->rstate = SMB_RECV_DROP;
-                return result;
-        }
-        server->rstate = req->rq_rlen > 0 ? SMB_RECV_DATA : SMB_RECV_END;
-        req->rq_bytes_recvd = 0;        // recvd out of the iov
-        VERBOSE("rlen: %d\n", req->rq_rlen);
-        if (req->rq_rlen < 0) {
-                PARANOIA("Parameters read beyond end of packet!\n");
-                server->rstate = SMB_RECV_END;
-                return -EIO;
-        }
-        return 0;
-}
-/*
- * Reads the SMB data
- */
-static int smb_recv_data(struct smb_sb_info *server, struct smb_request *req)
-{
-        int result;
-        result = smb_receive(server, req);
-        if (result < 0)
-                goto out;
-        if (req->rq_bytes_recvd < req->rq_rlen)
-                goto out;
-        server->rstate = SMB_RECV_END;
-out:
-        VERBOSE("result: %d\n", result);
-        return result;
-}
-/*
- * Receive a transaction2 response
- * Return: 0 if the response has been fully read
- *         1 if there are further "fragments" to read
- *        <0 if there is an error
- */
-static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
-{
-        unsigned char *inbuf;
-        unsigned int parm_disp, parm_offset, parm_count, parm_tot;
-        unsigned int data_disp, data_offset, data_count, data_tot;
-        int hdrlen = SMB_HEADER_LEN + req->rq_resp_wct*2 - 2;
-        VERBOSE("handling trans2\n");
-        inbuf = req->rq_header;
-        data_tot    = WVAL(inbuf, smb_tdrcnt);
-        parm_tot    = WVAL(inbuf, smb_tprcnt);
-        parm_disp   = WVAL(inbuf, smb_prdisp);
-        parm_offset = WVAL(inbuf, smb_proff);
-        parm_count  = WVAL(inbuf, smb_prcnt);
-        data_disp   = WVAL(inbuf, smb_drdisp);
-        data_offset = WVAL(inbuf, smb_droff);
-        data_count  = WVAL(inbuf, smb_drcnt);
-        /* Modify offset for the split header/buffer we use */
-        if (data_count || data_offset) {
-                if (unlikely(data_offset < hdrlen))
-                        goto out_bad_data;
-                else
-                        data_offset -= hdrlen;
-        }
-        if (parm_count || parm_offset) {
-                if (unlikely(parm_offset < hdrlen))
-                        goto out_bad_parm;
-                else
-                        parm_offset -= hdrlen;
-        }
-        if (parm_count == parm_tot && data_count == data_tot) {
-                /*
-                 * This packet has all the trans2 data.
-                 *
-                 * We setup the request so that this will be the common
-                 * case. It may be a server error to not return a
-                 * response that fits.
-                 */
-                VERBOSE("single trans2 response  "
-                        "dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-                        data_count, parm_count,
-                        data_offset, parm_offset);
-                req->rq_ldata = data_count;
-                req->rq_lparm = parm_count;
-                req->rq_data = req->rq_buffer + data_offset;
-                req->rq_parm = req->rq_buffer + parm_offset;
-                if (unlikely(parm_offset + parm_count > req->rq_rlen))
-                        goto out_bad_parm;
-                if (unlikely(data_offset + data_count > req->rq_rlen))
-                        goto out_bad_data;
-                return 0;
-        }
-        VERBOSE("multi trans2 response  "
-                "frag=%d, dcnt=%u, pcnt=%u, doff=%u, poff=%u\n",
-                req->rq_fragment,
-                data_count, parm_count,
-                data_offset, parm_offset);
-        if (!req->rq_fragment) {
-                int buf_len;
-                /* We got the first trans2 fragment */
-                req->rq_fragment = 1;
-                req->rq_total_data = data_tot;
-                req->rq_total_parm = parm_tot;
-                req->rq_ldata = 0;
-                req->rq_lparm = 0;
-                buf_len = data_tot + parm_tot;
-                if (buf_len > SMB_MAX_PACKET_SIZE)
-                        goto out_too_long;
-                req->rq_trans2bufsize = buf_len;
-                req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
-                if (!req->rq_trans2buffer)
-                        goto out_no_mem;
-                req->rq_parm = req->rq_trans2buffer;
-                req->rq_data = req->rq_trans2buffer + parm_tot;
-        } else if (unlikely(req->rq_total_data < data_tot ||
-                            req->rq_total_parm < parm_tot))
-                goto out_data_grew;
-        if (unlikely(parm_disp + parm_count > req->rq_total_parm ||
-                     parm_offset + parm_count > req->rq_rlen))
-                goto out_bad_parm;
-        if (unlikely(data_disp + data_count > req->rq_total_data ||
-                     data_offset + data_count > req->rq_rlen))
-                goto out_bad_data;
-        inbuf = req->rq_buffer;
-        memcpy(req->rq_parm + parm_disp, inbuf + parm_offset, parm_count);
-        memcpy(req->rq_data + data_disp, inbuf + data_offset, data_count);
-        req->rq_ldata += data_count;
-        req->rq_lparm += parm_count;
-        /*
-         * Check whether we've received all of the data. Note that
-         * we use the packet totals -- total lengths might shrink!
-         */
-        if (req->rq_ldata >= data_tot && req->rq_lparm >= parm_tot) {
-                req->rq_ldata = data_tot;
-                req->rq_lparm = parm_tot;
-                return 0;
-        }
-        return 1;
-out_too_long:
-        printk(KERN_ERR "smb_trans2: data/param too long, data=%u, parm=%u\n",
-                data_tot, parm_tot);
-        goto out_EIO;
-out_no_mem:
-        printk(KERN_ERR "smb_trans2: couldn't allocate data area of %d bytes\n",
-               req->rq_trans2bufsize);
-        req->rq_errno = -ENOMEM;
-        goto out;
-out_data_grew:
-        printk(KERN_ERR "smb_trans2: data/params grew!\n");
-        goto out_EIO;
-out_bad_parm:
-        printk(KERN_ERR "smb_trans2: invalid parms, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-               parm_disp, parm_count, parm_tot, parm_offset);
-        goto out_EIO;
-out_bad_data:
-        printk(KERN_ERR "smb_trans2: invalid data, disp=%u, cnt=%u, tot=%u, ofs=%u\n",
-               data_disp, data_count, data_tot, data_offset);
-out_EIO:
-        req->rq_errno = -EIO;
-out:
-        return req->rq_errno;
-}
-/*
- * State machine for receiving responses. We handle the fact that we can't
- * read the full response in one try by having states telling us how much we
- * have read.
- *
- * Must be called with the server lock held (only called from smbiod).
- *
- * Return: <0 on error
- */
-int smb_request_recv(struct smb_sb_info *server)
-{
-        struct smb_request *req = NULL;
-        int result = 0;
-        if (smb_recv_available(server) <= 0)
-                return 0;
-        VERBOSE("state: %d\n", server->rstate);
-        switch (server->rstate) {
-        case SMB_RECV_DROP:
-                result = smb_receive_drop(server);
-                if (result < 0)
-                        break;
-                if (server->rstate == SMB_RECV_DROP)
-                        break;
-                server->rstate = SMB_RECV_START;
-                /* fallthrough */
-        case SMB_RECV_START:
-                server->smb_read = 0;
-                server->rstate = SMB_RECV_HEADER;
-                /* fallthrough */
-        case SMB_RECV_HEADER:
-                result = smb_receive_header(server);
-                if (result < 0)
-                        break;
-                if (server->rstate == SMB_RECV_HEADER)
-                        break;
-                if (! (*(server->header + smb_flg) & SMB_FLAGS_REPLY) ) {
-                        server->rstate = SMB_RECV_REQUEST;
-                        break;
-                }
-                if (server->rstate != SMB_RECV_HCOMPLETE)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_HCOMPLETE:
-                req = find_request(server, WVAL(server->header, smb_mid));
-                if (!req)
-                        break;
-                smb_init_request(server, req);
-                req->rq_rcls = *(req->rq_header + smb_rcls);
-                req->rq_err  = WVAL(req->rq_header, smb_err);
-                if (server->rstate != SMB_RECV_PARAM)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_PARAM:
-                if (!req)
-                        req = find_request(server,WVAL(server->header,smb_mid));
-                if (!req)
-                        break;
-                result = smb_recv_param(server, req);
-                if (result < 0)
-                        break;
-                if (server->rstate != SMB_RECV_DATA)
-                        break;
-                /* fallthrough */
-        case SMB_RECV_DATA:
-                if (!req)
-                        req = find_request(server,WVAL(server->header,smb_mid));
-                if (!req)
-                        break;
-                result = smb_recv_data(server, req);
-                if (result < 0)
-                        break;
-                break;
-                /* We should never be called with any of these states */
-        case SMB_RECV_END:
-        case SMB_RECV_REQUEST:
-                BUG();
-        }
-        if (result < 0) {
-                /* We saw an error */
-                return result;
-        }
-        if (server->rstate != SMB_RECV_END)
-                return 0;
-        result = 0;
-        if (req->rq_trans2_command && req->rq_rcls == SUCCESS)
-                result = smb_recv_trans2(server, req);
-        /*
-         * Response completely read. Drop any extra bytes sent by the server.
-         * (Yes, servers sometimes add extra bytes to responses)
-         */
-        VERBOSE("smb_len: %d   smb_read: %d\n",
-                server->smb_len, server->smb_read);
-        if (server->smb_read < server->smb_len)
-                smb_receive_drop(server);
-        server->rstate = SMB_RECV_START;
-        if (!result) {
-                list_del_init(&req->rq_queue);
-                req->rq_flags |= SMB_REQ_RECEIVED;
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        return 0;
-}
diff --git a/fs/smbfs/request.h b/fs/smbfs/request.h
deleted file mode 100644
index efb21451e7c9..000000000000
--- a/fs/smbfs/request.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-#include <linux/wait.h>
-struct smb_request {
-        struct list_head rq_queue;      /* recvq or xmitq for the server */
-        atomic_t rq_count;
-        wait_queue_head_t rq_wait;
-        int rq_flags;
-        int rq_mid;     /* multiplex ID, set by request.c */
-        struct smb_sb_info *rq_server;
-        /* header + word count + parameter words + byte count */
-        unsigned char rq_header[SMB_HEADER_LEN + 20*2 + 2];
-        int rq_bufsize;
-        unsigned char *rq_buffer;
-        /* FIXME: this is not good enough for merging IO requests. */
-        unsigned char *rq_page;
-        int rq_rsize;
-        int rq_resp_wct;
-        int rq_resp_bcc;
-        int rq_rlen;
-        int rq_bytes_recvd;
-        int rq_slen;
-        int rq_bytes_sent;
-        int rq_iovlen;
-        struct kvec rq_iov[4];
-        int (*rq_setup_read) (struct smb_request *);
-        void (*rq_callback) (struct smb_request *);
-        /* ------ trans2 stuff ------ */
-        u16 rq_trans2_command;  /* 0 if not a trans2 request */
-        unsigned int rq_ldata;
-        unsigned char *rq_data;
-        unsigned int rq_lparm;
-        unsigned char *rq_parm;
-        int rq_fragment;
-        u32 rq_total_data;
-        u32 rq_total_parm;
-        int rq_trans2bufsize;
-        unsigned char *rq_trans2buffer;
-        /* ------ response ------ */
-        unsigned short rq_rcls;
-        unsigned short rq_err;
-        int rq_errno;
-};
-#define SMB_REQ_STATIC          0x0001  /* rq_buffer is static */
-#define SMB_REQ_NORETRY         0x0002  /* request is invalid after retry */
-#define SMB_REQ_TRANSMITTED     0x4000  /* all data has been sent */
-#define SMB_REQ_RECEIVED        0x8000  /* reply received, smbiod is done */
-#define xSMB_REQ_NOREPLY        0x0004  /* we don't want the reply (if any) */
-#define xSMB_REQ_NORECEIVER     0x0008  /* caller doesn't wait for response */
diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h
deleted file mode 100644
index fc4b1a5dd755..000000000000
--- a/fs/smbfs/smb_debug.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Defines some debug macros for smbfs.
- */
-/* This makes a dentry parent/child name pair. Useful for debugging printk's */
-#define DENTRY_PATH(dentry) \
-        (dentry)->d_parent->d_name.name,(dentry)->d_name.name
-/*
- * safety checks that should never happen ???
- * these are normally enabled.
- */
-#ifdef SMBFS_PARANOIA
-# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a)
-#else
-# define PARANOIA(f, a...) do { ; } while(0)
-#endif
-/* lots of debug messages */
-#ifdef SMBFS_DEBUG_VERBOSE
-# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-# define VERBOSE(f, a...) do { ; } while(0)
-#endif
-/*
- * "normal" debug messages, but not with a normal DEBUG define ... way
- * too common name.
- */
-#ifdef SMBFS_DEBUG
-#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a)
-#else
-#define DEBUG1(f, a...) do { ; } while(0)
-#endif
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
deleted file mode 100644
index 0e39a924f10a..000000000000
--- a/fs/smbfs/smbiod.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- *  smbiod.c
- *
- *  Copyright (C) 2000, Charles Loep / Corel Corp.
- *  Copyright (C) 2001, Urban Widmark
- */
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/file.h>
-#include <linux/dcache.h>
-#include <linux/module.h>
-#include <linux/net.h>
-#include <linux/kthread.h>
-#include <net/ip.h>
-#include <linux/smb_fs.h>
-#include <linux/smbno.h>
-#include <linux/smb_mount.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include "smb_debug.h"
-#include "request.h"
-#include "proto.h"
-enum smbiod_state {
-        SMBIOD_DEAD,
-        SMBIOD_STARTING,
-        SMBIOD_RUNNING,
-};
-static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static struct task_struct *smbiod_thread;
-static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
-static LIST_HEAD(smb_servers);
-static DEFINE_SPINLOCK(servers_lock);
-#define SMBIOD_DATA_READY       (1<<0)
-static unsigned long smbiod_flags;
-static int smbiod(void *);
-static int smbiod_start(void);
-/*
- * called when there's work for us to do
- */
-void smbiod_wake_up(void)
-{
-        if (smbiod_state == SMBIOD_DEAD)
-                return;
-        set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-        wake_up_interruptible(&smbiod_wait);
-}
-/*
- * start smbiod if none is running
- */
-static int smbiod_start(void)
-{
-        struct task_struct *tsk;
-        int err = 0;
-        if (smbiod_state != SMBIOD_DEAD)
-                return 0;
-        smbiod_state = SMBIOD_STARTING;
-        __module_get(THIS_MODULE);
-        spin_unlock(&servers_lock);
-        tsk = kthread_run(smbiod, NULL, "smbiod");
-        if (IS_ERR(tsk)) {
-                err = PTR_ERR(tsk);
-                module_put(THIS_MODULE);
-        }
-        spin_lock(&servers_lock);
-        if (err < 0) {
-                smbiod_state = SMBIOD_DEAD;
-                smbiod_thread = NULL;
-        } else {
-                smbiod_state = SMBIOD_RUNNING;
-                smbiod_thread = tsk;
-        }
-        return err;
-}
-/*
- * register a server & start smbiod if necessary
- */
-int smbiod_register_server(struct smb_sb_info *server)
-{
-        int ret;
-        spin_lock(&servers_lock);
-        list_add(&server->entry, &smb_servers);
-        VERBOSE("%p\n", server);
-        ret = smbiod_start();
-        spin_unlock(&servers_lock);
-        return ret;
-}
-/*
- * Unregister a server
- * Must be called with the server lock held.
- */
-void smbiod_unregister_server(struct smb_sb_info *server)
-{
-        spin_lock(&servers_lock);
-        list_del_init(&server->entry);
-        VERBOSE("%p\n", server);
-        spin_unlock(&servers_lock);
-        smbiod_wake_up();
-        smbiod_flush(server);
-}
-void smbiod_flush(struct smb_sb_info *server)
-{
-        struct list_head *tmp, *n;
-        struct smb_request *req;
-        list_for_each_safe(tmp, n, &server->xmitq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        list_for_each_safe(tmp, n, &server->recvq) {
-                req = list_entry(tmp, struct smb_request, rq_queue);
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-}
-/*
- * Wake up smbmount and make it reconnect to the server.
- * This must be called with the server locked.
- *
- * FIXME: add smbconnect version to this
- */
-int smbiod_retry(struct smb_sb_info *server)
-{
-        struct list_head *head;
-        struct smb_request *req;
-        struct pid *pid = get_pid(server->conn_pid);
-        int result = 0;
-        VERBOSE("state: %d\n", server->state);
-        if (server->state == CONN_VALID || server->state == CONN_RETRYING)
-                goto out;
-        smb_invalidate_inodes(server);
-        /*
-         * Some requests are meaningless after a retry, so we abort them.
-         * One example are all requests using 'fileid' since the files are
-         * closed on retry.
-         */
-        head = server->xmitq.next;
-        while (head != &server->xmitq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-                head = head->next;
-                req->rq_bytes_sent = 0;
-                if (req->rq_flags & SMB_REQ_NORETRY) {
-                        VERBOSE("aborting request %p on xmitq\n", req);
-                        req->rq_errno = -EIO;
-                        list_del_init(&req->rq_queue);
-                        smb_rput(req);
-                        wake_up_interruptible(&req->rq_wait);
-                }
-        }
-        /*
-         * FIXME: test the code for retrying request we already sent
-         */
-        head = server->recvq.next;
-        while (head != &server->recvq) {
-                req = list_entry(head, struct smb_request, rq_queue);
-                head = head->next;
-#if 0
-                if (req->rq_flags & SMB_REQ_RETRY) {
-                        /* must move the request to the xmitq */
-                        VERBOSE("retrying request %p on recvq\n", req);
-                        list_move(&req->rq_queue, &server->xmitq);
-                        continue;
-                }
-#endif
-                VERBOSE("aborting request %p on recvq\n", req);
-                /* req->rq_rcls = ???; */ /* FIXME: set smb error code too? */
-                req->rq_errno = -EIO;
-                list_del_init(&req->rq_queue);
-                smb_rput(req);
-                wake_up_interruptible(&req->rq_wait);
-        }
-        smb_close_socket(server);
-        if (!pid) {
-                /* FIXME: this is fatal, umount? */
-                printk(KERN_ERR "smb_retry: no connection process\n");
-                server->state = CONN_RETRIED;
-                goto out;
-        }
-        /*
-         * Change state so that only one retry per server will be started.
-         */
-        server->state = CONN_RETRYING;
-        /*
-         * Note: use the "priv" flag, as a user process may need to reconnect.
-         */
-        result = kill_pid(pid, SIGUSR1, 1);
-        if (result) {
-                /* FIXME: this is most likely fatal, umount? */
-                printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
-                goto out;
-        }
-        VERBOSE("signalled pid %d\n", pid_nr(pid));
-        /* FIXME: The retried requests should perhaps get a "time boost". */
-out:
-        put_pid(pid);
-        return result;
-}
-/*
- * Currently handles lockingX packets.
- */
-static void smbiod_handle_request(struct smb_sb_info *server)
-{
-        PARANOIA("smbiod got a request ... and we don't implement oplocks!\n");
-        server->rstate = SMB_RECV_DROP;
-}
-/*
- * Do some IO for one server.
- */
-static void smbiod_doio(struct smb_sb_info *server)
-{
-        int result;
-        int maxwork = 7;
-        if (server->state != CONN_VALID)
-                goto out;
-        do {
-                result = smb_request_recv(server);
-                if (result < 0) {
-                        server->state = CONN_INVALID;
-                        smbiod_retry(server);
-                        goto out;       /* reconnecting is slow */
-                } else if (server->rstate == SMB_RECV_REQUEST)
-                        smbiod_handle_request(server);
-        } while (result > 0 && maxwork-- > 0);
-        /*
-         * If there is more to read then we want to be sure to wake up again.
-         */
-        if (server->state != CONN_VALID)
-                goto out;
-        if (smb_recv_available(server) > 0)
-                set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-        do {
-                result = smb_request_send_server(server);
-                if (result < 0) {
-                        server->state = CONN_INVALID;
-                        smbiod_retry(server);
-                        goto out;       /* reconnecting is slow */
-                }
-        } while (result > 0);
-        /*
-         * If the last request was not sent out we want to wake up again.
-         */
-        if (!list_empty(&server->xmitq))
-                set_bit(SMBIOD_DATA_READY, &smbiod_flags);
-out:
-        return;
-}
-/*
- * smbiod kernel thread
- */
-static int smbiod(void *unused)
-{
-        VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
-        for (;;) {
-                struct smb_sb_info *server;
-                struct list_head *pos, *n;
-                /* FIXME: Use poll? */
-                wait_event_interruptible(smbiod_wait,
-                         test_bit(SMBIOD_DATA_READY, &smbiod_flags));
-                if (signal_pending(current)) {
-                        spin_lock(&servers_lock);
-                        smbiod_state = SMBIOD_DEAD;
-                        spin_unlock(&servers_lock);
-                        break;
-                }
-                clear_bit(SMBIOD_DATA_READY, &smbiod_flags);
-                spin_lock(&servers_lock);
-                if (list_empty(&smb_servers)) {
-                        smbiod_state = SMBIOD_DEAD;
-                        spin_unlock(&servers_lock);
-                        break;
-                }
-                list_for_each_safe(pos, n, &smb_servers) {
-                        server = list_entry(pos, struct smb_sb_info, entry);
-                        VERBOSE("checking server %p\n", server);
-                        if (server->state == CONN_VALID) {
-                                spin_unlock(&servers_lock);
-                                smb_lock_server(server);
-                                smbiod_doio(server);
-                                smb_unlock_server(server);
-                                spin_lock(&servers_lock);
-                        }
-                }
-                spin_unlock(&servers_lock);
-        }
-        VERBOSE("SMB Kernel thread exiting (%d) ...\n", current->pid);
-        module_put_and_exit(0);
-}
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
deleted file mode 100644
index e37fe4deebd0..000000000000
--- a/fs/smbfs/sock.c
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- *  sock.c
- *
- *  Copyright (C) 1995, 1996 by Paal-Kr. Engstad and Volker Lendecke
- *  Copyright (C) 1997 by Volker Lendecke
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/fcntl.h>
-#include <linux/file.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/mm.h>
-#include <linux/netdevice.h>
-#include <linux/workqueue.h>
-#include <net/scm.h>
-#include <net/tcp_states.h>
-#include <net/ip.h>
-#include <linux/smb_fs.h>
-#include <linux/smb.h>
-#include <linux/smbno.h>
-#include <asm/uaccess.h>
-#include <asm/ioctls.h>
-#include "smb_debug.h"
-#include "proto.h"
-#include "request.h"
-static int
-_recvfrom(struct socket *socket, unsigned char *ubuf, int size, unsigned flags)
-{
-        struct kvec iov = {ubuf, size};
-        struct msghdr msg = {.msg_flags = flags};
-        msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL;
-        return kernel_recvmsg(socket, &msg, &iov, 1, size, msg.msg_flags);
-}
-/*
- * Return the server this socket belongs to
- */
-static struct smb_sb_info *
-server_from_socket(struct socket *socket)
-{
-        return socket->sk->sk_user_data;
-}
-/*
- * Called when there is data on the socket.
- */
-void
-smb_data_ready(struct sock *sk, int len)
-{
-        struct smb_sb_info *server = server_from_socket(sk->sk_socket);
-        void (*data_ready)(struct sock *, int) = server->data_ready;
-        data_ready(sk, len);
-        VERBOSE("(%p, %d)\n", sk, len);
-        smbiod_wake_up();
-}
-int
-smb_valid_socket(struct inode * inode)
-{
-        return (inode && S_ISSOCK(inode->i_mode) && 
-                SOCKET_I(inode)->type == SOCK_STREAM);
-}
-static struct socket *
-server_sock(struct smb_sb_info *server)
-{
-        struct file *file;
-        if (server && (file = server->sock_file))
-        {
-#ifdef SMBFS_PARANOIA
-                if (!smb_valid_socket(file->f_path.dentry->d_inode))
-                        PARANOIA("bad socket!\n");
-#endif
-                return SOCKET_I(file->f_path.dentry->d_inode);
-        }
-        return NULL;
-}
-void
-smb_close_socket(struct smb_sb_info *server)
-{
-        struct file * file = server->sock_file;
-        if (file) {
-                struct socket *sock = server_sock(server);
-                VERBOSE("closing socket %p\n", sock);
-                sock->sk->sk_data_ready = server->data_ready;
-                server->sock_file = NULL;
-                fput(file);
-        }
-}
-static int
-smb_get_length(struct socket *socket, unsigned char *header)
-{
-        int result;
-        result = _recvfrom(socket, header, 4, MSG_PEEK);
-        if (result == -EAGAIN)
-                return -ENODATA;
-        if (result < 0) {
-                PARANOIA("recv error = %d\n", -result);
-                return result;
-        }
-        if (result < 4)
-                return -ENODATA;
-        switch (header[0]) {
-        case 0x00:
-        case 0x82:
-                break;
-        case 0x85:
-                DEBUG1("Got SESSION KEEP ALIVE\n");
-                _recvfrom(socket, header, 4, 0);        /* read away */
-                return -ENODATA;
-        default:
-                PARANOIA("Invalid NBT packet, code=%x\n", header[0]);
-                return -EIO;
-        }
-        /* The length in the RFC NB header is the raw data length */
-        return smb_len(header);
-}
-int
-smb_recv_available(struct smb_sb_info *server)
-{
-        mm_segment_t oldfs;
-        int avail, err;
-        struct socket *sock = server_sock(server);
-        oldfs = get_fs();
-        set_fs(get_ds());
-        err = sock->ops->ioctl(sock, SIOCINQ, (unsigned long) &avail);
-        set_fs(oldfs);
-        return (err >= 0) ? avail : err;
-}
-/*
- * Adjust the kvec to move on 'n' bytes (from nfs/sunrpc)
- */
-static int
-smb_move_iov(struct kvec **data, size_t *num, struct kvec *vec, unsigned amount)
-{
-        struct kvec *iv = *data;
-        int i;
-        int len;
-        /*
-         *      Eat any sent kvecs
-         */
-        while (iv->iov_len <= amount) {
-                amount -= iv->iov_len;
-                iv++;
-                (*num)--;
-        }
-        /*
-         *      And chew down the partial one
-         */
-        vec[0].iov_len = iv->iov_len-amount;
-        vec[0].iov_base =((unsigned char *)iv->iov_base)+amount;
-        iv++;
-        len = vec[0].iov_len;
-        /*
-         *      And copy any others
-         */
-        for (i = 1; i < *num; i++) {
-                vec[i] = *iv++;
-                len += vec[i].iov_len;
-        }
-        *data = vec;
-        return len;
-}
-/*
- * smb_receive_header
- * Only called by the smbiod thread.
- */
-int
-smb_receive_header(struct smb_sb_info *server)
-{
-        struct socket *sock;
-        int result = 0;
-        unsigned char peek_buf[4];
-        result = -EIO; 
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        if (!server->smb_read) {
-                result = smb_get_length(sock, peek_buf);
-                if (result < 0) {
-                        if (result == -ENODATA)
-                                result = 0;
-                        goto out;
-                }
-                server->smb_len = result + 4;
-                if (server->smb_len < SMB_HEADER_LEN) {
-                        PARANOIA("short packet: %d\n", result);
-                        server->rstate = SMB_RECV_DROP;
-                        result = -EIO;
-                        goto out;
-                }
-                if (server->smb_len > SMB_MAX_PACKET_SIZE) {
-                        PARANOIA("long packet: %d\n", result);
-                        server->rstate = SMB_RECV_DROP;
-                        result = -EIO;
-                        goto out;
-                }
-        }
-        result = _recvfrom(sock, server->header + server->smb_read,
-                           SMB_HEADER_LEN - server->smb_read, 0);
-        VERBOSE("_recvfrom: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        server->smb_read += result;
-        if (server->smb_read == SMB_HEADER_LEN)
-                server->rstate = SMB_RECV_HCOMPLETE;
-out:
-        return result;
-}
-static char drop_buffer[PAGE_SIZE];
-/*
- * smb_receive_drop - read and throw away the data
- * Only called by the smbiod thread.
- *
- * FIXME: we are in the kernel, could we just tell the socket that we want
- * to drop stuff from the buffer?
- */
-int
-smb_receive_drop(struct smb_sb_info *server)
-{
-        struct socket *sock;
-        unsigned int flags;
-        struct kvec iov;
-        struct msghdr msg;
-        int rlen = smb_len(server->header) - server->smb_read + 4;
-        int result = -EIO;
-        if (rlen > PAGE_SIZE)
-                rlen = PAGE_SIZE;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-        iov.iov_base = drop_buffer;
-        iov.iov_len = PAGE_SIZE;
-        msg.msg_flags = flags;
-        msg.msg_name = NULL;
-        msg.msg_namelen = 0;
-        msg.msg_control = NULL;
-        result = kernel_recvmsg(sock, &msg, &iov, 1, rlen, flags);
-        VERBOSE("read: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        server->smb_read += result;
-        if (server->smb_read >= server->smb_len)
-                server->rstate = SMB_RECV_END;
-out:
-        return result;
-}
-/*
- * smb_receive
- * Only called by the smbiod thread.
- */
-int
-smb_receive(struct smb_sb_info *server, struct smb_request *req)
-{
-        struct socket *sock;
-        unsigned int flags;
-        struct kvec iov[4];
-        struct kvec *p = req->rq_iov;
-        size_t num = req->rq_iovlen;
-        struct msghdr msg;
-        int rlen;
-        int result = -EIO;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        flags = MSG_DONTWAIT | MSG_NOSIGNAL;
-        msg.msg_flags = flags;
-        msg.msg_name = NULL;
-        msg.msg_namelen = 0;
-        msg.msg_control = NULL;
-        /* Dont repeat bytes and count available bufferspace */
-        rlen = min_t(int, smb_move_iov(&p, &num, iov, req->rq_bytes_recvd),
-                        (req->rq_rlen - req->rq_bytes_recvd));
-        result = kernel_recvmsg(sock, &msg, p, num, rlen, flags);
-        VERBOSE("read: %d\n", result);
-        if (result < 0) {
-                VERBOSE("receive error: %d\n", result);
-                goto out;
-        }
-        req->rq_bytes_recvd += result;
-        server->smb_read += result;
-out:
-        return result;
-}
-/*
- * Try to send a SMB request. This may return after sending only parts of the
- * request. SMB_REQ_TRANSMITTED will be set if a request was fully sent.
- *
- * Parts of this was taken from xprt_sendmsg from net/sunrpc/xprt.c
- */
-int
-smb_send_request(struct smb_request *req)
-{
-        struct smb_sb_info *server = req->rq_server;
-        struct socket *sock;
-        struct msghdr msg = {.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT};
-        int slen = req->rq_slen - req->rq_bytes_sent;
-        int result = -EIO;
-        struct kvec iov[4];
-        struct kvec *p = req->rq_iov;
-        size_t num = req->rq_iovlen;
-        sock = server_sock(server);
-        if (!sock)
-                goto out;
-        if (sock->sk->sk_state != TCP_ESTABLISHED)
-                goto out;
-        /* Dont repeat bytes */
-        if (req->rq_bytes_sent)
-                smb_move_iov(&p, &num, iov, req->rq_bytes_sent);
-        result = kernel_sendmsg(sock, &msg, p, num, slen);
-        if (result >= 0) {
-                req->rq_bytes_sent += result;
-                if (req->rq_bytes_sent >= req->rq_slen)
-                        req->rq_flags |= SMB_REQ_TRANSMITTED;
-        }
-out:
-        return result;
-}
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
deleted file mode 100644
index 00b2909bd469..000000000000
--- a/fs/smbfs/symlink.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  symlink.c
- *
- *  Copyright (C) 2002 by John Newbigin
- *
- *  Please add a note about your changes to smbfs in the ChangeLog file.
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/net.h>
-#include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/smbno.h>
-#include <linux/smb_fs.h>
-#include "smb_debug.h"
-#include "proto.h"
-int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname)
-{
-        DEBUG1("create symlink %s -> %s/%s\n", oldname, DENTRY_PATH(dentry));
-        return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname);
-}
-static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        char *link = __getname();
-        DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry));
-        if (!link) {
-                link = ERR_PTR(-ENOMEM);
-        } else {
-                int len = smb_proc_read_link(server_from_dentry(dentry),
-                                                dentry, link, PATH_MAX - 1);
-                if (len < 0) {
-                        __putname(link);
-                        link = ERR_PTR(len);
-                } else {
-                        link[len] = 0;
-                }
-        }
-        nd_set_link(nd, link);
-        return NULL;
-}
-static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-        char *s = nd_get_link(nd);
-        if (!IS_ERR(s))
-                __putname(s);
-}
-const struct inode_operations smb_link_inode_operations =
-{
-        .readlink       = generic_readlink,
-        .follow_link    = smb_follow_link,
-        .put_link       = smb_put_link,
-};
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 12b933ac6585..0dc340aa2be9 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -230,5 +230,6 @@ failed_read:
 const struct file_operations squashfs_dir_ops = {
        .read = generic_read_dir,
-        .readdir = squashfs_readdir
+        .readdir = squashfs_readdir,
+        .llseek = default_llseek,
 };
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88b4f8606652..24de30ba34c1 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -354,8 +353,6 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 static void squashfs_put_super(struct super_block *sb)
 {
-        lock_kernel();
        if (sb->s_fs_info) {
                struct squashfs_sb_info *sbi = sb->s_fs_info;
                squashfs_cache_delete(sbi->block_cache);
@@ -370,17 +367,13 @@ static void squashfs_put_super(struct super_block *sb)
                kfree(sb->s_fs_info);
                sb->s_fs_info = NULL;
        }
-        unlock_kernel();
 }
-static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
-                                const char *dev_name, void *data,
+                                const char *dev_name, void *data)
-                                struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
-                                mnt);
 }
@@ -456,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode)
 static struct file_system_type squashfs_fs_type = {
        .owner = THIS_MODULE,
        .name = "squashfs",
-        .get_sb = squashfs_get_sb,
+        .mount = squashfs_mount,
        .kill_sb = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV
 };
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 652b8541f9c6..3876c36699a1 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index,
                                        strncmp(target, name, name_size) == 0) {
                        /* found xattr */
                        if (type & SQUASHFS_XATTR_VALUE_OOL) {
-                                __le64 xattr;
+                                __le64 xattr_val;
+                                u64 xattr;
                                /* val is a reference to the real location */
                                err = squashfs_read_metadata(sb, &val, &start,
                                                &offset, sizeof(val));
                                if (err < 0)
                                        goto failed;
-                                err = squashfs_read_metadata(sb, &xattr, &start,
+                                err = squashfs_read_metadata(sb, &xattr_val,
-                                         &offset, sizeof(xattr));
+                                        &start, &offset, sizeof(xattr_val));
                                if (err < 0)
                                        goto failed;
-                                xattr = le64_to_cpu(xattr);
+                                xattr = le64_to_cpu(xattr_val);
                                start = SQUASHFS_XATTR_BLK(xattr) +
                                                        msblk->xattr_table;
                                offset = SQUASHFS_XATTR_OFFSET(xattr);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 49fe0d719fbf..b634efce4bde 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -25,7 +25,7 @@
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
                u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
-                int *, unsigned long long *);
+                unsigned int *, unsigned long long *);
 #else
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
                u64 start, u64 *xattr_table_start, int *xattr_ids)
@@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 }
 static inline int squashfs_xattr_lookup(struct super_block *sb,
-                unsigned int index, int *count, int *size,
+                unsigned int index, int *count, unsigned int *size,
                unsigned long long *xattr)
 {
        return 0;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index cfb41106098f..d33be5dd6c32 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -34,6 +34,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 /*
 * Map xattr id using the xattr id look up table
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..ca696155cd9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@ void generic_shutdown_super(struct super_block *sb)
                get_fs_excl();
                sb->s_flags &= ~MS_ACTIVE;
-                /* bad name - it should be evict_inodes() */
+                fsnotify_unmount_inodes(&sb->s_inodes);
-                invalidate_inodes(sb);
+                evict_inodes(sb);
                if (sop->put_super)
                        sop->put_super(sb);
-                /* Forget any remaining inodes */
+                if (!list_empty(&sb->s_inodes)) {
-                if (invalidate_inodes(sb)) {
                        printk("VFS: Busy inodes after unmount of %s. "
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
@@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data)
        return set_anon_super(sb, NULL);
 }
-int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
+struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
-        int (*fill_super)(struct super_block *, void *, int),
+        void *data, int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct super_block *sb;
        sb = sget(fs_type, ns_test_super, ns_set_super, data);
        if (IS_ERR(sb))
-                return PTR_ERR(sb);
+                return ERR_CAST(sb);
        if (!sb->s_root) {
                int err;
@@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
                err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (err) {
                        deactivate_locked_super(sb);
-                        return err;
+                        return ERR_PTR(err);
                }
                sb->s_flags |= MS_ACTIVE;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 }
-EXPORT_SYMBOL(get_sb_ns);
+EXPORT_SYMBOL(mount_ns);
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
@@ -762,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data)
        return (void *)s->s_bdev == data;
 }
-int get_sb_bdev(struct file_system_type *fs_type,
+struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct block_device *bdev;
        struct super_block *s;
@@ -777,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
        if (IS_ERR(bdev))
-                return PTR_ERR(bdev);
+                return ERR_CAST(bdev);
        /*
         * once the super is inserted into the list by sget, s_umount
@@ -829,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type,
                bdev->bd_super = s;
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 error_s:
        error = PTR_ERR(s);
 error_bdev:
        close_bdev_exclusive(bdev, mode);
 error:
-        return error;
+        return ERR_PTR(error);
+}
+EXPORT_SYMBOL(mount_bdev);
+int get_sb_bdev(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
+        return 0;
 }
 EXPORT_SYMBOL(get_sb_bdev);
@@ -856,29 +868,42 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
 #endif
-int get_sb_nodev(struct file_system_type *fs_type,
+struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        int error;
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        s->s_flags = flags;
        error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
        if (error) {
                deactivate_locked_super(s);
-                return error;
+                return ERR_PTR(error);
        }
        s->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
-        return 0;
 }
+EXPORT_SYMBOL(mount_nodev);
+int get_sb_nodev(struct file_system_type *fs_type,
+        int flags, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_nodev(fs_type, flags, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
+        return 0;
+}
 EXPORT_SYMBOL(get_sb_nodev);
 static int compare_single(struct super_block *s, void *p)
@@ -886,29 +911,42 @@ static int compare_single(struct super_block *s, void *p)
        return 1;
 }
-int get_sb_single(struct file_system_type *fs_type,
+struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int),
+        int (*fill_super)(struct super_block *, void *, int))
-        struct vfsmount *mnt)
 {
        struct super_block *s;
        int error;
        s = sget(fs_type, compare_single, set_anon_super, NULL);
        if (IS_ERR(s))
-                return PTR_ERR(s);
+                return ERR_CAST(s);
        if (!s->s_root) {
                s->s_flags = flags;
                error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(s);
-                        return error;
+                        return ERR_PTR(error);
                }
                s->s_flags |= MS_ACTIVE;
        } else {
                do_remount_sb(s, flags, data, 0);
        }
-        simple_set_mnt(mnt, s);
+        return dget(s->s_root);
+}
+EXPORT_SYMBOL(mount_single);
+int get_sb_single(struct file_system_type *fs_type,
+        int flags, void *data,
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
+{
+        struct dentry *root;
+        root = mount_single(fs_type, flags, data, fill_super);
+        if (IS_ERR(root))
+                return PTR_ERR(root);
+        mnt->mnt_root = root;
+        mnt->mnt_sb = root->d_sb;
        return 0;
 }
@@ -918,6 +956,7 @@ struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
        struct vfsmount *mnt;
+        struct dentry *root;
        char *secdata = NULL;
        int error;
@@ -942,9 +981,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
                        goto out_free_secdata;
        }
-        error = type->get_sb(type, flags, name, data, mnt);
+        if (type->mount) {
-        if (error < 0)
+                root = type->mount(type, flags, name, data);
-                goto out_free_secdata;
+                if (IS_ERR(root)) {
+                        error = PTR_ERR(root);
+                        goto out_free_secdata;
+                }
+                mnt->mnt_root = root;
+                mnt->mnt_sb = root->d_sb;
+        } else {
+                error = type->get_sb(type, flags, name, data, mnt);
+                if (error < 0)
+                        goto out_free_secdata;
+        }
        BUG_ON(!mnt->mnt_sb);
        WARN_ON(!mnt->mnt_sb->s_bdi);
        mnt->mnt_sb->s_flags |= MS_BORN;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 4e321f7353fa..a4759833d62d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -179,30 +179,14 @@ static void bin_vma_open(struct vm_area_struct *vma)
        struct bin_buffer *bb = file->private_data;
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-        if (!bb->vm_ops || !bb->vm_ops->open)
+        if (!bb->vm_ops)
-                return;
-        if (!sysfs_get_active(attr_sd))
-                return;
-        bb->vm_ops->open(vma);
-        sysfs_put_active(attr_sd);
-}
-static void bin_vma_close(struct vm_area_struct *vma)
-{
-        struct file *file = vma->vm_file;
-        struct bin_buffer *bb = file->private_data;
-        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-        if (!bb->vm_ops || !bb->vm_ops->close)
                return;
        if (!sysfs_get_active(attr_sd))
                return;
-        bb->vm_ops->close(vma);
+        if (bb->vm_ops->open)
+                bb->vm_ops->open(vma);
        sysfs_put_active(attr_sd);
 }
@@ -214,13 +198,15 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        int ret;
-        if (!bb->vm_ops || !bb->vm_ops->fault)
+        if (!bb->vm_ops)
                return VM_FAULT_SIGBUS;
        if (!sysfs_get_active(attr_sd))
                return VM_FAULT_SIGBUS;
-        ret = bb->vm_ops->fault(vma, vmf);
+        ret = VM_FAULT_SIGBUS;
+        if (bb->vm_ops->fault)
+                ret = bb->vm_ops->fault(vma, vmf);
        sysfs_put_active(attr_sd);
        return ret;
@@ -236,13 +222,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (!bb->vm_ops)
                return VM_FAULT_SIGBUS;
-        if (!bb->vm_ops->page_mkwrite)
-                return 0;
        if (!sysfs_get_active(attr_sd))
                return VM_FAULT_SIGBUS;
-        ret = bb->vm_ops->page_mkwrite(vma, vmf);
+        ret = 0;
+        if (bb->vm_ops->page_mkwrite)
+                ret = bb->vm_ops->page_mkwrite(vma, vmf);
        sysfs_put_active(attr_sd);
        return ret;
@@ -256,13 +241,15 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        int ret;
-        if (!bb->vm_ops || !bb->vm_ops->access)
+        if (!bb->vm_ops)
                return -EINVAL;
        if (!sysfs_get_active(attr_sd))
                return -EINVAL;
-        ret = bb->vm_ops->access(vma, addr, buf, len, write);
+        ret = -EINVAL;
+        if (bb->vm_ops->access)
+                ret = bb->vm_ops->access(vma, addr, buf, len, write);
        sysfs_put_active(attr_sd);
        return ret;
@@ -276,13 +263,15 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        int ret;
-        if (!bb->vm_ops || !bb->vm_ops->set_policy)
+        if (!bb->vm_ops)
                return 0;
        if (!sysfs_get_active(attr_sd))
                return -EINVAL;
-        ret = bb->vm_ops->set_policy(vma, new);
+        ret = 0;
+        if (bb->vm_ops->set_policy)
+                ret = bb->vm_ops->set_policy(vma, new);
        sysfs_put_active(attr_sd);
        return ret;
@@ -296,13 +285,15 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        struct mempolicy *pol;
-        if (!bb->vm_ops || !bb->vm_ops->get_policy)
+        if (!bb->vm_ops)
                return vma->vm_policy;
        if (!sysfs_get_active(attr_sd))
                return vma->vm_policy;
-        pol = bb->vm_ops->get_policy(vma, addr);
+        pol = vma->vm_policy;
+        if (bb->vm_ops->get_policy)
+                pol = bb->vm_ops->get_policy(vma, addr);
        sysfs_put_active(attr_sd);
        return pol;
@@ -316,13 +307,15 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
        struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
        int ret;
-        if (!bb->vm_ops || !bb->vm_ops->migrate)
+        if (!bb->vm_ops)
                return 0;
        if (!sysfs_get_active(attr_sd))
                return 0;
-        ret = bb->vm_ops->migrate(vma, from, to, flags);
+        ret = 0;
+        if (bb->vm_ops->migrate)
+                ret = bb->vm_ops->migrate(vma, from, to, flags);
        sysfs_put_active(attr_sd);
        return ret;
@@ -331,7 +324,6 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 static const struct vm_operations_struct bin_vm_ops = {
        .open           = bin_vma_open,
-        .close          = bin_vma_close,
        .fault          = bin_fault,
        .page_mkwrite   = bin_page_mkwrite,
        .access         = bin_access,
@@ -377,6 +369,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
        if (bb->mmapped && bb->vm_ops != vma->vm_ops)
                goto out_put;
+        /*
+         * It is not possible to successfully wrap close.
+         * So error if someone is trying to use close.
+         */
+        rc = -EINVAL;
+        if (vma->vm_ops && vma->vm_ops->close)
+                goto out_put;
        rc = 0;
        bb->mmapped = 1;
        bb->vm_ops = vma->vm_ops;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f2af22574c50..266895783b47 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,7 +23,7 @@
 #include "sysfs.h"
-static struct vfsmount *sysfs_mount;
+static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 static const struct super_operations sysfs_ops = {
@@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data)
        return error;
 }
-static int sysfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *sysfs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
        struct sysfs_super_info *info;
        enum kobj_ns_type type;
        struct super_block *sb;
        int error;
-        error = -ENOMEM;
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info)
-                goto out;
+                return ERR_PTR(-ENOMEM);
        for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
                info->ns[type] = kobj_ns_current(type);
@@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
        sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
        if (IS_ERR(sb) || sb->s_fs_info != info)
                kfree(info);
-        if (IS_ERR(sb)) {
+        if (IS_ERR(sb))
-                error = PTR_ERR(sb);
+                return ERR_CAST(sb);
-                goto out;
-        }
        if (!sb->s_root) {
                sb->s_flags = flags;
                error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(sb);
-                        goto out;
+                        return ERR_PTR(error);
                }
                sb->s_flags |= MS_ACTIVE;
        }
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        error = 0;
-out:
-        return error;
 }
 static void sysfs_kill_sb(struct super_block *sb)
@@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb)
 static struct file_system_type sysfs_fs_type = {
        .name           = "sysfs",
-        .get_sb         = sysfs_get_sb,
+        .mount          = sysfs_mount,
        .kill_sb        = sysfs_kill_sb,
 };
@@ -189,11 +183,11 @@ int __init sysfs_init(void)
        err = register_filesystem(&sysfs_fs_type);
        if (!err) {
-                sysfs_mount = kern_mount(&sysfs_fs_type);
+                sysfs_mnt = kern_mount(&sysfs_fs_type);
-                if (IS_ERR(sysfs_mount)) {
+                if (IS_ERR(sysfs_mnt)) {
                        printk(KERN_ERR "sysfs: could not mount!\n");
-                        err = PTR_ERR(sysfs_mount);
+                        err = PTR_ERR(sysfs_mnt);
-                        sysfs_mount = NULL;
+                        sysfs_mnt = NULL;
                        unregister_filesystem(&sysfs_fs_type);
                        goto out_err;
                }
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b59b8d..11e7f7d11cd0 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        return add_nondir(dentry, inode);
 }
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a0b0cda6927e..3d9c62be0c10 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -526,23 +526,22 @@ failed:
 /* Every kernel module contains stuff like this. */
-static int sysv_get_sb(struct file_system_type *fs_type,
+static struct dentry *sysv_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
-                           mnt);
 }
-static int v7_get_sb(struct file_system_type *fs_type,
+static struct dentry *v7_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
 }
 static struct file_system_type sysv_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "sysv",
-        .get_sb         = sysv_get_sb,
+        .mount          = sysv_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = {
 static struct file_system_type v7_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "v7",
-        .get_sb         = v7_get_sb,
+        .mount          = v7_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b86ab8eff79a..8c4fc1425b3e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -144,6 +144,7 @@ static const struct file_operations timerfd_fops = {
        .release        = timerfd_release,
        .poll           = timerfd_poll,
        .read           = timerfd_read,
+        .llseek         = noop_llseek,
 };
 static struct file *timerfd_fget(int fd)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 37fa7ed062d8..02429d81ca33 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -63,7 +63,9 @@ static int do_commit(struct ubifs_info *c)
        struct ubifs_lp_stats lst;
        dbg_cmt("start");
-        if (c->ro_media) {
+        ubifs_assert(!c->ro_media && !c->ro_mount);
+        if (c->ro_error) {
                err = -EROFS;
                goto out_up;
        }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2a68baa782f..0bee4dbffc31 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2239,6 +2239,162 @@ out_free:
        return err;
 }
+/**
+ * dbg_check_data_nodes_order - verify the ordering of a data node list.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * Each pair of neighbouring nodes is inspected: both have to be data nodes,
+ * inode numbers must not decrease, and within one inode block numbers must
+ * strictly increase. Nothing is checked unless %UBIFS_CHK_GEN is set in
+ * 'ubifs_chk_flags'. Returns zero if the list is in order and %-EINVAL if not.
+ */
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+        struct list_head *pos;
+        struct ubifs_scan_node *first, *second;
+        if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+                return 0;
+        /* Stop at the last entry - it has no successor to compare with */
+        for (pos = head->next; pos->next != head; pos = pos->next) {
+                ino_t first_inum, second_inum;
+                uint32_t first_blk, second_blk;
+                cond_resched();
+                first = container_of(pos, struct ubifs_scan_node, list);
+                second = container_of(pos->next, struct ubifs_scan_node, list);
+                if (first->type != UBIFS_DATA_NODE) {
+                        ubifs_err("bad node type %d", first->type);
+                        dbg_dump_node(c, first->node);
+                        return -EINVAL;
+                }
+                if (second->type != UBIFS_DATA_NODE) {
+                        ubifs_err("bad node type %d", second->type);
+                        dbg_dump_node(c, second->node);
+                        return -EINVAL;
+                }
+                first_inum = key_inum(c, &first->key);
+                second_inum = key_inum(c, &second->key);
+                if (first_inum != second_inum) {
+                        /* Different inodes, block numbers are irrelevant */
+                        if (first_inum < second_inum)
+                                continue;
+                        ubifs_err("larger inum %lu goes before inum %lu",
+                                  (unsigned long)first_inum,
+                                  (unsigned long)second_inum);
+                        goto out_dump;
+                }
+                /* Same inode, so the block numbers decide */
+                first_blk = key_block(c, &first->key);
+                second_blk = key_block(c, &second->key);
+                if (first_blk < second_blk)
+                        continue;
+                if (first_blk > second_blk)
+                        ubifs_err("larger block %u goes before %u",
+                                  first_blk, second_blk);
+                else
+                        ubifs_err("two data nodes for the same block");
+                goto out_dump;
+        }
+        return 0;
+out_dump:
+        dbg_dump_node(c, first->node);
+        dbg_dump_node(c, second->node);
+        return -EINVAL;
+}
+/**
+ * dbg_check_nondata_nodes_order - check that list of non-data nodes is sorted.
+ * @c: UBIFS file-system description object
+ * @head: the list of nodes ('struct ubifs_scan_node' objects)
+ *
+ * Every pair of neighbouring nodes is checked. Both nodes have to be inode,
+ * dentry or xentry nodes; inode nodes have to precede all other nodes and be
+ * sorted in descending size order; dentries and xentries have to be sorted in
+ * ascending (parent ino, hash) order. Nothing is checked unless
+ * %UBIFS_CHK_GEN is set in 'ubifs_chk_flags'.
+ *
+ * This function returns zero if the list of non-data nodes is sorted correctly,
+ * and %-EINVAL if not.
+ */
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
+{
+        struct list_head *cur;
+        struct ubifs_scan_node *sa, *sb;
+        if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+                return 0;
+        /* The last list entry has no successor, so the walk stops before it */
+        for (cur = head->next; cur->next != head; cur = cur->next) {
+                ino_t inuma, inumb;
+                uint32_t hasha, hashb;
+                cond_resched();
+                sa = container_of(cur, struct ubifs_scan_node, list);
+                sb = container_of(cur->next, struct ubifs_scan_node, list);
+                if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
+                    sa->type != UBIFS_XENT_NODE) {
+                        ubifs_err("bad node type %d", sa->type);
+                        dbg_dump_node(c, sa->node);
+                        return -EINVAL;
+                }
+                /* The second node of the pair has to be validated as well */
+                if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
+                    sb->type != UBIFS_XENT_NODE) {
+                        ubifs_err("bad node type %d", sb->type);
+                        dbg_dump_node(c, sb->node);
+                        return -EINVAL;
+                }
+                if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+                        ubifs_err("non-inode node goes before inode node");
+                        goto error_dump;
+                }
+                if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE)
+                        continue;
+                if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
+                        /* Inode nodes are sorted in descending size order */
+                        if (sa->len < sb->len) {
+                                ubifs_err("smaller inode node goes first");
+                                goto error_dump;
+                        }
+                        continue;
+                }
+                /*
+                 * This is either a dentry or xentry, which should be sorted in
+                 * ascending (parent ino, hash) order.
+                 */
+                inuma = key_inum(c, &sa->key);
+                inumb = key_inum(c, &sb->key);
+                if (inuma < inumb)
+                        continue;
+                if (inuma > inumb) {
+                        ubifs_err("larger inum %lu goes before inum %lu",
+                                  (unsigned long)inuma, (unsigned long)inumb);
+                        goto error_dump;
+                }
+                /*
+                 * NOTE(review): the hash is read with 'key_block()';
+                 * presumably the block and hash fields occupy the same bits of
+                 * the key - confirm against key.h. Equal hashes are accepted.
+                 */
+                hasha = key_block(c, &sa->key);
+                hashb = key_block(c, &sb->key);
+                if (hasha > hashb) {
+                        ubifs_err("larger hash %u goes before %u", hasha, hashb);
+                        goto error_dump;
+                }
+        }
+        return 0;
+error_dump:
+        ubifs_msg("dumping first node");
+        dbg_dump_node(c, sa->node);
+        ubifs_msg("dumping second node");
+        dbg_dump_node(c, sb->node);
+        return -EINVAL;
+}
 static int invocation_cnt;
 int dbg_force_in_the_gaps(void)
@@ -2625,6 +2781,7 @@ static const struct file_operations dfs_fops = {
        .open = open_debugfs_file,
        .write = write_debugfs_file,
        .owner = THIS_MODULE,
+        .llseek = default_llseek,
 };
 /**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 29d960101ea6..69ebe4729151 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -324,6 +324,8 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
                        int row, int col);
 int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
                         loff_t size);
+int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
+int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 /* Force the use of in-the-gaps method for testing */
@@ -465,6 +467,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define dbg_check_lprops(c)                        0
 #define dbg_check_lpt_nodes(c, cnode, row, col)    0
 #define dbg_check_inode_size(c, inode, size)       0
+#define dbg_check_data_nodes_order(c, head)        0
+#define dbg_check_nondata_nodes_order(c, head)     0
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce72213..14f64b689d7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        lock_2_inodes(dir, inode);
        inc_nlink(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        inode->i_ctime = ubifs_current_time(inode);
        dir->i_size += sz_change;
        dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 03ae894c45de..d77db7e36484 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -433,8 +433,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
        struct page *page;
        ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+        ubifs_assert(!c->ro_media && !c->ro_mount);
-        if (unlikely(c->ro_media))
+        if (unlikely(c->ro_error))
                return -EROFS;
        /* Try out the fast-path part first */
@@ -1439,9 +1440,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vm
        dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
                i_size_read(inode));
-        ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
+        ubifs_assert(!c->ro_media && !c->ro_mount);
-        if (unlikely(c->ro_media))
+        if (unlikely(c->ro_error))
                return VM_FAULT_SIGBUS; /* -EROFS */
        /*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 918d1582ca05..151f10882820 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -125,10 +125,16 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
        struct ubifs_scan_node *sa, *sb;
        cond_resched();
+        if (a == b)
+                return 0;
        sa = list_entry(a, struct ubifs_scan_node, list);
        sb = list_entry(b, struct ubifs_scan_node, list);
        ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
        ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+        ubifs_assert(sa->type == UBIFS_DATA_NODE);
+        ubifs_assert(sb->type == UBIFS_DATA_NODE);
        inuma = key_inum(c, &sa->key);
        inumb = key_inum(c, &sb->key);
@@ -157,28 +163,40 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 */
 int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
-        int typea, typeb;
        ino_t inuma, inumb;
        struct ubifs_info *c = priv;
        struct ubifs_scan_node *sa, *sb;
        cond_resched();
+        if (a == b)
+                return 0;
        sa = list_entry(a, struct ubifs_scan_node, list);
        sb = list_entry(b, struct ubifs_scan_node, list);
-        typea = key_type(c, &sa->key);
-        typeb = key_type(c, &sb->key);
+        ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY &&
-        ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+                     key_type(c, &sb->key) != UBIFS_DATA_KEY);
+        ubifs_assert(sa->type != UBIFS_DATA_NODE &&
+                     sb->type != UBIFS_DATA_NODE);
        /* Inodes go before directory entries */
-        if (typea == UBIFS_INO_KEY) {
+        if (sa->type == UBIFS_INO_NODE) {
-                if (typeb == UBIFS_INO_KEY)
+                if (sb->type == UBIFS_INO_NODE)
                        return sb->len - sa->len;
                return -1;
        }
-        if (typeb == UBIFS_INO_KEY)
+        if (sb->type == UBIFS_INO_NODE)
                return 1;
-        ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+        ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY ||
+                     key_type(c, &sa->key) == UBIFS_XENT_KEY);
+        ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY ||
+                     key_type(c, &sb->key) == UBIFS_XENT_KEY);
+        ubifs_assert(sa->type == UBIFS_DENT_NODE ||
+                     sa->type == UBIFS_XENT_NODE);
+        ubifs_assert(sb->type == UBIFS_DENT_NODE ||
+                     sb->type == UBIFS_XENT_NODE);
        inuma = key_inum(c, &sa->key);
        inumb = key_inum(c, &sb->key);
@@ -224,17 +242,33 @@ int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
 static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                      struct list_head *nondata, int *min)
 {
+        int err;
        struct ubifs_scan_node *snod, *tmp;
        *min = INT_MAX;
        /* Separate data nodes and non-data nodes */
        list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-                int err;
+                ubifs_assert(snod->type == UBIFS_INO_NODE  ||
+                             snod->type == UBIFS_DATA_NODE ||
+                             snod->type == UBIFS_DENT_NODE ||
+                             snod->type == UBIFS_XENT_NODE ||
+                             snod->type == UBIFS_TRUN_NODE);
+                if (snod->type != UBIFS_INO_NODE  &&
+                    snod->type != UBIFS_DATA_NODE &&
+                    snod->type != UBIFS_DENT_NODE &&
+                    snod->type != UBIFS_XENT_NODE) {
+                        /* Probably truncation node, zap it */
+                        list_del(&snod->list);
+                        kfree(snod);
+                        continue;
+                }
-                ubifs_assert(snod->type != UBIFS_IDX_NODE);
+                ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY ||
-                ubifs_assert(snod->type != UBIFS_REF_NODE);
+                             key_type(c, &snod->key) == UBIFS_INO_KEY  ||
-                ubifs_assert(snod->type != UBIFS_CS_NODE);
+                             key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+                             key_type(c, &snod->key) == UBIFS_XENT_KEY);
                err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
                                         snod->offs, 0);
@@ -258,6 +292,13 @@ static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
        /* Sort data and non-data nodes */
        list_sort(c, &sleb->nodes, &data_nodes_cmp);
        list_sort(c, nondata, &nondata_nodes_cmp);
+        err = dbg_check_data_nodes_order(c, &sleb->nodes);
+        if (err)
+                return err;
+        err = dbg_check_nondata_nodes_order(c, nondata);
+        if (err)
+                return err;
        return 0;
 }
@@ -575,13 +616,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
        struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
        ubifs_assert_cmt_locked(c);
+        ubifs_assert(!c->ro_media && !c->ro_mount);
        if (ubifs_gc_should_commit(c))
                return -EAGAIN;
        mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
-        if (c->ro_media) {
+        if (c->ro_error) {
                ret = -EROFS;
                goto out_unlock;
        }
@@ -677,14 +719,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
                ret = ubifs_garbage_collect_leb(c, &lp);
                if (ret < 0) {
-                        if (ret == -EAGAIN || ret == -ENOSPC) {
+                        if (ret == -EAGAIN) {
                                /*
-                                 * These codes are not errors, so we have to
+                                 * This is not error, so we have to return the
-                                 * return the LEB to lprops. But if the
+                                 * LEB to lprops. But if 'ubifs_return_leb()'
-                                 * 'ubifs_return_leb()' function fails, its
+                                 * fails, its failure code is propagated to the
-                                 * failure code is propagated to the caller
+                                 * caller instead of the original '-EAGAIN'.
-                                 * instead of the original '-EAGAIN' or
-                                 * '-ENOSPC'.
                                 */
                                err = ubifs_return_leb(c, lp.lnum);
                                if (err)
@@ -774,8 +814,8 @@ out_unlock:
 out:
        ubifs_assert(ret < 0);
        ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
-        ubifs_ro_mode(c, ret);
        ubifs_wbuf_sync_nolock(wbuf);
+        ubifs_ro_mode(c, ret);
        mutex_unlock(&wbuf->io_mutex);
        ubifs_return_leb(c, lp.lnum);
        return ret;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index bcf5a16f30bb..d82173182eeb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -61,8 +61,8 @@
 */
 void ubifs_ro_mode(struct ubifs_info *c, int err)
 {
-        if (!c->ro_media) {
+        if (!c->ro_error) {
-                c->ro_media = 1;
+                c->ro_error = 1;
                c->no_chk_data_crc = 0;
                c->vfs_sb->s_flags |= MS_RDONLY;
                ubifs_warn("switched to read-only mode, error %d", err);
@@ -356,11 +356,11 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
        dbg_io("LEB %d:%d, %d bytes, jhead %s",
               wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
-        ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
        ubifs_assert(!(wbuf->avail & 7));
        ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+        ubifs_assert(!c->ro_media && !c->ro_mount);
-        if (c->ro_media)
+        if (c->ro_error)
                return -EROFS;
        ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
@@ -440,11 +440,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
 {
        int err, i;
+        ubifs_assert(!c->ro_media && !c->ro_mount);
        if (!c->need_wbuf_sync)
                return 0;
        c->need_wbuf_sync = 0;
-        if (c->ro_media) {
+        if (c->ro_error) {
                err = -EROFS;
                goto out_timers;
        }
@@ -519,6 +520,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
        ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
        ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+        ubifs_assert(!c->ro_media && !c->ro_mount);
        if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
                err = -ENOSPC;
@@ -527,7 +529,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        cancel_wbuf_timer_nolock(wbuf);
-        if (c->ro_media)
+        if (c->ro_error)
                return -EROFS;
        if (aligned_len <= wbuf->avail) {
@@ -663,8 +665,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
               buf_len);
        ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
        ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+        ubifs_assert(!c->ro_media && !c->ro_mount);
-        if (c->ro_media)
+        if (c->ro_error)
                return -EROFS;
        ubifs_prepare_node(c, buf, len, 1);
@@ -815,7 +818,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
        return 0;
 out:
-        ubifs_err("bad node at LEB %d:%d", lnum, offs);
+        ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
+                  ubi_is_mapped(c->ubi, lnum));
        dbg_dump_node(c, buf);
        dbg_dump_stack();
        return -EINVAL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d321baeca68d..914f1bd89e57 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len)
         * better to try to allocate space at the ends of eraseblocks. This is
         * what the squeeze parameter does.
         */
+        ubifs_assert(!c->ro_media && !c->ro_mount);
        squeeze = (jhead == BASEHD);
 again:
        mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
-        if (c->ro_media) {
+        if (c->ro_error) {
                err = -EROFS;
                goto out_unlock;
        }
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 0f530c684f0b..92a8491a8f8c 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -306,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c,
 }
 /**
+ * invalid_key_init - set up a node key which is not valid.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ *
+ * Helper which turns @key into an invalid key: the second word is set to
+ * %UBIFS_INVALID_KEY and the first word to the 0xDEADBEAF filler value. The
+ * @c argument is not used here.
+ */
+static inline void invalid_key_init(const struct ubifs_info *c,
+                                    union ubifs_key *key)
+{
+        key->u32[1] = UBIFS_INVALID_KEY;
+        key->u32[0] = 0xDEADBEAF;
+}
+/**
 * key_type - get key type.
 * @c: UBIFS file-system description object
 * @key: key to get type of
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c345e125f42c..4d0cb1241460 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -159,7 +159,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
                jhead = &c->jheads[bud->jhead];
                list_add_tail(&bud->list, &jhead->buds_list);
        } else
-                ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+                ubifs_assert(c->replaying && c->ro_mount);
        /*
         * Note, although this is a new bud, we anyway account this space now,
@@ -223,8 +223,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
        }
        mutex_lock(&c->log_mutex);
+        ubifs_assert(!c->ro_media && !c->ro_mount);
-        if (c->ro_media) {
+        if (c->ro_error) {
                err = -EROFS;
                goto out_unlock;
        }
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 0084a33c4c69..72775d35b99e 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1363,6 +1363,7 @@ static int read_lsave(struct ubifs_info *c)
                goto out;
        for (i = 0; i < c->lsave_cnt; i++) {
                int lnum = c->lsave[i];
+                struct ubifs_lprops *lprops;
                /*
                 * Due to automatic resizing, the values in the lsave table
@@ -1370,7 +1371,11 @@ static int read_lsave(struct ubifs_info *c)
                 */
                if (lnum >= c->leb_cnt)
                        continue;
-                ubifs_lpt_lookup(c, lnum);
+                lprops = ubifs_lpt_lookup(c, lnum);
+                if (IS_ERR(lprops)) {
+                        err = PTR_ERR(lprops);
+                        goto out;
+                }
        }
 out:
        vfree(buf);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d12535b7fc78..5c90dec5db0b 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -705,6 +705,9 @@ static int make_tree_dirty(struct ubifs_info *c)
        struct ubifs_pnode *pnode;
        pnode = pnode_lookup(c, 0);
+        if (IS_ERR(pnode))
+                return PTR_ERR(pnode);
        while (pnode) {
                do_make_pnode_dirty(c, pnode);
                pnode = next_pnode_to_dirty(c, pnode);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 28beaeedadc0..21f47afdacff 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -361,7 +361,8 @@ int ubifs_write_master(struct ubifs_info *c)
 {
        int err, lnum, offs, len;
-        if (c->ro_media)
+        ubifs_assert(!c->ro_media && !c->ro_mount);
+        if (c->ro_error)
                return -EROFS;
        lnum = UBIFS_MST_LNUM;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4fa81d867e41..c3de04dc952a 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -132,7 +132,8 @@ static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
 {
        int err;
-        if (c->ro_media)
+        ubifs_assert(!c->ro_media && !c->ro_mount);
+        if (c->ro_error)
                return -EROFS;
        err = ubi_leb_unmap(c->ubi, lnum);
        if (err) {
@@ -159,7 +160,8 @@ static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
 {
        int err;
-        if (c->ro_media)
+        ubifs_assert(!c->ro_media && !c->ro_mount);
+        if (c->ro_error)
                return -EROFS;
        err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
        if (err) {
@@ -186,7 +188,8 @@ static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
 {
        int err;
-        if (c->ro_media)
+        ubifs_assert(!c->ro_media && !c->ro_mount);
+        if (c->ro_error)
                return -EROFS;
        err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
        if (err) {
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index daae9e1f5382..77e9b874b6c2 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -292,7 +292,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
        memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
-        if ((c->vfs_sb->s_flags & MS_RDONLY)) {
+        if (c->ro_mount) {
                /* Read-only mode. Keep a copy for switching to rw mode */
                c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
                if (!c->rcvrd_mst_node) {
@@ -469,7 +469,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                endpt = snod->offs + snod->len;
        }
-        if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
+        if (c->ro_mount && !c->remounting_rw) {
                /* Add to recovery list */
                struct ubifs_unclean_leb *ucleb;
@@ -772,7 +772,8 @@ out_free:
 * @sbuf: LEB-sized buffer to use
 *
 * This function does a scan of a LEB, but caters for errors that might have
- * been caused by the unclean unmount from which we are attempting to recover.
+ * been caused by unclean reboots from which we are attempting to recover
+ * (assume that only the last log LEB can be corrupted by an unclean reboot).
 *
 * This function returns %0 on success and a negative error code on failure.
 */
@@ -883,7 +884,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
 {
        int err;
-        ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
+        ubifs_assert(!c->ro_mount || c->remounting_rw);
        dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
        err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
@@ -1461,7 +1462,7 @@ int ubifs_recover_size(struct ubifs_info *c)
                        }
                }
                if (e->exists && e->i_size < e->d_size) {
-                        if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
+                        if (!e->inode && c->ro_mount) {
                                /* Fix the inode size and pin it in memory */
                                struct inode *inode;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 5c2d6d759a3e..eed0fcff8d73 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -627,8 +627,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
        ubifs_assert(sleb->endpt - offs >= used);
        ubifs_assert(sleb->endpt % c->min_io_size == 0);
-        if (sleb->endpt + c->min_io_size <= c->leb_size &&
+        if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount)
-            !(c->vfs_sb->s_flags & MS_RDONLY))
                err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
                                             sleb->endpt, UBI_SHORTTERM);
@@ -840,6 +839,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
        if (IS_ERR(sleb)) {
                if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
                        return PTR_ERR(sleb);
+                /*
+                 * Note, the below function will recover this log LEB only if
+                 * it is the last, because unclean reboots can possibly corrupt
+                 * only the tail of the log.
+                 */
                sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
                if (IS_ERR(sleb))
                        return PTR_ERR(sleb);
@@ -851,7 +855,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
        }
        node = sleb->buf;
        snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
        if (c->cs_sqnum == 0) {
                /*
@@ -898,7 +901,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
        }
        list_for_each_entry(snod, &sleb->nodes, list) {
                cond_resched();
                if (snod->sqnum >= SQNUM_WATERMARK) {
@@ -1011,7 +1013,6 @@ out:
 int ubifs_replay_journal(struct ubifs_info *c)
 {
        int err, i, lnum, offs, free;
-        void *sbuf = NULL;
        BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
@@ -1026,14 +1027,8 @@ int ubifs_replay_journal(struct ubifs_info *c)
                return -EINVAL;
        }
-        sbuf = vmalloc(c->leb_size);
-        if (!sbuf)
-                return -ENOMEM;
        dbg_mnt("start replaying the journal");
        c->replaying = 1;
        lnum = c->ltail_lnum = c->lhead_lnum;
        offs = c->lhead_offs;
@@ -1046,7 +1041,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
                        lnum = UBIFS_LOG_LNUM;
                        offs = 0;
                }
-                err = replay_log_leb(c, lnum, offs, sbuf);
+                err = replay_log_leb(c, lnum, offs, c->sbuf);
                if (err == 1)
                        /* We hit the end of the log */
                        break;
@@ -1079,7 +1074,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
 out:
        destroy_replay_tree(c);
        destroy_bud_list(c);
-        vfree(sbuf);
        c->replaying = 0;
        return err;
 }
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 96cb62c8a9dd..bf31b4729e51 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -542,11 +542,8 @@ int ubifs_read_superblock(struct ubifs_info *c)
         * due to the unavailability of time-travelling equipment.
         */
        if (c->fmt_version > UBIFS_FORMAT_VERSION) {
-                struct super_block *sb = c->vfs_sb;
+                ubifs_assert(!c->ro_media || c->ro_mount);
-                int mounting_ro = sb->s_flags & MS_RDONLY;
+                if (!c->ro_mount ||
-                ubifs_assert(!c->ro_media || mounting_ro);
-                if (!mounting_ro ||
                    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
                        ubifs_err("on-flash format version is w%d/r%d, but "
                                  "software only supports up to version "
@@ -624,7 +621,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
        c->old_leb_cnt = c->leb_cnt;
        if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
                c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
-                if (c->vfs_sb->s_flags & MS_RDONLY)
+                if (c->ro_mount)
                        dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
                                c->old_leb_cnt, c->leb_cnt);
                else {
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 96c525384191..3e1ee57dbeaa 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -197,7 +197,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
        struct ubifs_ino_node *ino = buf;
        struct ubifs_scan_node *snod;
-        snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+        snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
        if (!snod)
                return -ENOMEM;
@@ -212,13 +212,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
        case UBIFS_DENT_NODE:
        case UBIFS_XENT_NODE:
        case UBIFS_DATA_NODE:
-        case UBIFS_TRUN_NODE:
                /*
                 * The key is in the same place in all keyed
                 * nodes.
                 */
                key_read(c, &ino->key, &snod->key);
                break;
+        default:
+                invalid_key_init(c, &snod->key);
+                break;
        }
        list_add_tail(&snod->list, &sleb->nodes);
        sleb->nodes_cnt += 1;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 0b201114a5ad..46961c003236 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -250,7 +250,7 @@ static int kick_a_thread(void)
                        dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
                        if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
-                            c->ro_media) {
+                            c->ro_mount || c->ro_error) {
                                mutex_unlock(&c->umount_mutex);
                                continue;
                        }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cd5900b85d38..91fac54c70e3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1137,11 +1137,11 @@ static int check_free_space(struct ubifs_info *c)
 */
 static int mount_ubifs(struct ubifs_info *c)
 {
-        struct super_block *sb = c->vfs_sb;
+        int err;
-        int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
        long long x;
        size_t sz;
+        c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
        err = init_constants_early(c);
        if (err)
                return err;
@@ -1154,7 +1154,7 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                goto out_free;
-        if (c->empty && (mounted_read_only || c->ro_media)) {
+        if (c->empty && (c->ro_mount || c->ro_media)) {
                /*
                 * This UBI volume is empty, and read-only, or the file system
                 * is mounted read-only - we cannot format it.
@@ -1165,7 +1165,7 @@ static int mount_ubifs(struct ubifs_info *c)
                goto out_free;
        }
-        if (c->ro_media && !mounted_read_only) {
+        if (c->ro_media && !c->ro_mount) {
                ubifs_err("cannot mount read-write - read-only media");
                err = -EROFS;
                goto out_free;
@@ -1185,7 +1185,7 @@ static int mount_ubifs(struct ubifs_info *c)
        if (!c->sbuf)
                goto out_free;
-        if (!mounted_read_only) {
+        if (!c->ro_mount) {
                c->ileb_buf = vmalloc(c->leb_size);
                if (!c->ileb_buf)
                        goto out_free;
@@ -1228,7 +1228,7 @@ static int mount_ubifs(struct ubifs_info *c)
        }
        sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
-        if (!mounted_read_only) {
+        if (!c->ro_mount) {
                err = alloc_wbufs(c);
                if (err)
                        goto out_cbuf;
@@ -1254,12 +1254,12 @@ static int mount_ubifs(struct ubifs_info *c)
        if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
                ubifs_msg("recovery needed");
                c->need_recovery = 1;
-                if (!mounted_read_only) {
+                if (!c->ro_mount) {
                        err = ubifs_recover_inl_heads(c, c->sbuf);
                        if (err)
                                goto out_master;
                }
-        } else if (!mounted_read_only) {
+        } else if (!c->ro_mount) {
                /*
                 * Set the "dirty" flag so that if we reboot uncleanly we
                 * will notice this immediately on the next mount.
@@ -1270,7 +1270,7 @@ static int mount_ubifs(struct ubifs_info *c)
                        goto out_master;
        }
-        err = ubifs_lpt_init(c, 1, !mounted_read_only);
+        err = ubifs_lpt_init(c, 1, !c->ro_mount);
        if (err)
                goto out_lpt;
@@ -1285,11 +1285,11 @@ static int mount_ubifs(struct ubifs_info *c)
        /* Calculate 'min_idx_lebs' after journal replay */
        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-        err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
+        err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
        if (err)
                goto out_orphans;
-        if (!mounted_read_only) {
+        if (!c->ro_mount) {
                int lnum;
                err = check_free_space(c);
@@ -1351,7 +1351,7 @@ static int mount_ubifs(struct ubifs_info *c)
        spin_unlock(&ubifs_infos_lock);
        if (c->need_recovery) {
-                if (mounted_read_only)
+                if (c->ro_mount)
                        ubifs_msg("recovery deferred");
                else {
                        c->need_recovery = 0;
@@ -1378,7 +1378,7 @@ static int mount_ubifs(struct ubifs_info *c)
        ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
                  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
-        if (mounted_read_only)
+        if (c->ro_mount)
                ubifs_msg("mounted read-only");
        x = (long long)c->main_lebs * c->leb_size;
        ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
@@ -1640,7 +1640,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
        }
        dbg_gen("re-mounted read-write");
-        c->vfs_sb->s_flags &= ~MS_RDONLY;
+        c->ro_mount = 0;
        c->remounting_rw = 0;
        c->always_chk_crc = 0;
        err = dbg_check_space_info(c);
@@ -1676,7 +1676,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
        int i, err;
        ubifs_assert(!c->need_recovery);
-        ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+        ubifs_assert(!c->ro_mount);
        mutex_lock(&c->umount_mutex);
        if (c->bgt) {
@@ -1686,10 +1686,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
        dbg_save_space_info(c);
-        for (i = 0; i < c->jhead_cnt; i++) {
+        for (i = 0; i < c->jhead_cnt; i++)
                ubifs_wbuf_sync(&c->jheads[i].wbuf);
-                hrtimer_cancel(&c->jheads[i].wbuf.timer);
-        }
        c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
        c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1704,6 +1702,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
        vfree(c->ileb_buf);
        c->ileb_buf = NULL;
        ubifs_lpt_free(c, 1);
+        c->ro_mount = 1;
        err = dbg_check_space_info(c);
        if (err)
                ubifs_ro_mode(c, err);
@@ -1735,7 +1734,7 @@ static void ubifs_put_super(struct super_block *sb)
         * the mutex is locked.
         */
        mutex_lock(&c->umount_mutex);
-        if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
+        if (!c->ro_mount) {
                /*
                 * First of all kill the background thread to make sure it does
                 * not interfere with un-mounting and freeing resources.
@@ -1745,23 +1744,22 @@ static void ubifs_put_super(struct super_block *sb)
                        c->bgt = NULL;
                }
-                /* Synchronize write-buffers */
-                if (c->jheads)
-                        for (i = 0; i < c->jhead_cnt; i++)
-                                ubifs_wbuf_sync(&c->jheads[i].wbuf);
                /*
-                 * On fatal errors c->ro_media is set to 1, in which case we do
+                 * On fatal errors c->ro_error is set to 1, in which case we do
                 * not write the master node.
                 */
-                if (!c->ro_media) {
+                if (!c->ro_error) {
+                        int err;
+                        /* Synchronize write-buffers */
+                        for (i = 0; i < c->jhead_cnt; i++)
+                                ubifs_wbuf_sync(&c->jheads[i].wbuf);
                        /*
                         * We are being cleanly unmounted which means the
                         * orphans were killed - indicate this in the master
                         * node. Also save the reserved GC LEB number.
                         */
-                        int err;
                        c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
                        c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
                        c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
@@ -1774,6 +1772,10 @@ static void ubifs_put_super(struct super_block *sb)
                                 */
                                ubifs_err("failed to write master node, "
                                          "error %d", err);
+                } else {
+                        for (i = 0; i < c->jhead_cnt; i++)
+                                /* Make sure write-buffer timers are canceled */
+                                hrtimer_cancel(&c->jheads[i].wbuf.timer);
                }
        }
@@ -1797,17 +1799,21 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
                return err;
        }
-        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+        if (c->ro_mount && !(*flags & MS_RDONLY)) {
+                if (c->ro_error) {
+                        ubifs_msg("cannot re-mount R/W due to prior errors");
+                        return -EROFS;
+                }
                if (c->ro_media) {
-                        ubifs_msg("cannot re-mount due to prior errors");
+                        ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
                        return -EROFS;
                }
                err = ubifs_remount_rw(c);
                if (err)
                        return err;
-        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+        } else if (!c->ro_mount && (*flags & MS_RDONLY)) {
-                if (c->ro_media) {
+                if (c->ro_error) {
-                        ubifs_msg("cannot re-mount due to prior errors");
+                        ubifs_msg("cannot re-mount R/O due to prior errors");
                        return -EROFS;
                }
                ubifs_remount_ro(c);
@@ -2032,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data)
        return c->vi.cdev == *dev;
 }
-static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
+static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
-                        const char *name, void *data, struct vfsmount *mnt)
+                        const char *name, void *data)
 {
        struct ubi_volume_desc *ubi;
        struct ubi_volume_info vi;
@@ -2049,9 +2055,9 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
         */
        ubi = open_ubi(name, UBI_READONLY);
        if (IS_ERR(ubi)) {
-                ubifs_err("cannot open \"%s\", error %d",
+                dbg_err("cannot open \"%s\", error %d",
-                          name, (int)PTR_ERR(ubi));
+                        name, (int)PTR_ERR(ubi));
-                return PTR_ERR(ubi);
+                return ERR_CAST(ubi);
        }
        ubi_get_volume_info(ubi, &vi);
@@ -2064,9 +2070,11 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
        }
        if (sb->s_root) {
+                struct ubifs_info *c1 = sb->s_fs_info;
                /* A new mount point for already mounted UBIFS */
                dbg_gen("this ubi volume is already mounted");
-                if ((flags ^ sb->s_flags) & MS_RDONLY) {
+                if (!!(flags & MS_RDONLY) != c1->ro_mount) {
                        err = -EBUSY;
                        goto out_deact;
                }
@@ -2087,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
        /* 'fill_super()' opens ubi again so we must close it here */
        ubi_close_volume(ubi);
-        simple_set_mnt(mnt, sb);
+        return dget(sb->s_root);
-        return 0;
 out_deact:
        deactivate_locked_super(sb);
 out_close:
        ubi_close_volume(ubi);
-        return err;
+        return ERR_PTR(err);
 }
 static struct file_system_type ubifs_fs_type = {
        .name    = "ubifs",
        .owner   = THIS_MODULE,
-        .get_sb  = ubifs_get_sb,
+        .mount   = ubifs_mount,
        .kill_sb = kill_anon_super,
 };
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2194915220e5..ad9cf0133622 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1177,6 +1177,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
        unsigned long time = get_seconds();
        dbg_tnc("search key %s", DBGKEY(key));
+        ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
        znode = c->zroot.znode;
        if (unlikely(!znode)) {
@@ -2966,7 +2967,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c,
 *
 * This function searches an indexing node by its first key @key and its
 * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
- * nodes it traverses to TNC. This function is called fro indexing nodes which
+ * nodes it traverses to TNC. This function is called for indexing nodes which
 * were found on the media by scanning, for example when garbage-collecting or
 * when doing in-the-gaps commit. This means that the indexing node which is
 * looked for does not have to have exactly the same leftmost key @key, because
@@ -2988,6 +2989,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
        struct ubifs_znode *znode, *zn;
        int n, nn;
+        ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
        /*
         * The arguments have probably been read off flash, so don't assume
         * they are valid.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0c9876b396dd..381d6b207a52 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -119,8 +119,12 @@
 * in TNC. However, when replaying, it is handy to introduce fake "truncation"
 * keys for truncation nodes because the code becomes simpler. So we define
 * %UBIFS_TRUN_KEY type.
+ *
+ * But otherwise, out of the journal replay scope, the truncation keys are
+ * invalid.
 */
-#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
+#define UBIFS_TRUN_KEY    UBIFS_KEY_TYPES_CNT
+#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT
 /*
 * How much a directory entry/extended attribute entry adds to the parent/host
@@ -1028,6 +1032,8 @@ struct ubifs_debug_info;
 * @max_leb_cnt: maximum count of logical eraseblocks
 * @old_leb_cnt: count of logical eraseblocks before re-size
 * @ro_media: the underlying UBI volume is read-only
+ * @ro_mount: the file-system was mounted as read-only
+ * @ro_error: UBIFS switched to R/O mode because an error happened
 *
 * @dirty_pg_cnt: number of dirty pages (not used)
 * @dirty_zn_cnt: number of dirty znodes
@@ -1168,11 +1174,14 @@ struct ubifs_debug_info;
 * @replay_sqnum: sequence number of node currently being replayed
 * @need_recovery: file-system needs recovery
 * @replaying: set to %1 during journal replay
- * @unclean_leb_list: LEBs to recover when mounting ro to rw
+ * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
- * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
+ *                    mode
+ * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
+ *                  FS to R/W mode
 * @size_tree: inode size information for recovery
- * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @remounting_rw: set while re-mounting from R/O mode to R/W mode
- * @always_chk_crc: always check CRCs (while mounting and remounting rw)
+ * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
+ *                  mode)
 * @mount_opts: UBIFS-specific mount options
 *
 * @dbg: debugging-related information
@@ -1268,7 +1277,9 @@ struct ubifs_info {
        int leb_cnt;
        int max_leb_cnt;
        int old_leb_cnt;
-        int ro_media;
+        unsigned int ro_media:1;
+        unsigned int ro_mount:1;
+        unsigned int ro_error:1;
        atomic_long_t dirty_pg_cnt;
        atomic_long_t dirty_zn_cnt;
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
 config UDF_FS
        tristate "UDF file system support"
+        depends on BKL # needs serious work to remove
        select CRC_ITU_T
        help
          This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc674193c..6d8dc02baebb 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        inc_nlink(inode);
        inode->i_ctime = current_fs_time(inode->i_sb);
        mark_inode_dirty(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        unlock_kernel();
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 65412d84a45d..4a5c7c61836a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 }
 /* UDF filesystem type */
-static int udf_get_sb(struct file_system_type *fs_type,
+static struct dentry *udf_mount(struct file_system_type *fs_type,
-                      int flags, const char *dev_name, void *data,
+                      int flags, const char *dev_name, void *data)
-                      struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
 }
 static struct file_system_type udf_fstype = {
        .owner          = THIS_MODULE,
        .name           = "udf",
-        .get_sb         = udf_get_sb,
+        .mount          = udf_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
@@ -1880,6 +1879,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        struct kernel_lb_addr rootdir, fileset;
        struct udf_sb_info *sbi;
+        lock_kernel();
        uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
        uopt.uid = -1;
        uopt.gid = -1;
@@ -1888,8 +1889,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        uopt.dmode = UDF_INVALID_MODE;
        sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
-        if (!sbi)
+        if (!sbi) {
+                unlock_kernel();
                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
@@ -2035,6 +2038,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
                goto error_out;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        unlock_kernel();
        return 0;
 error_out:
@@ -2055,6 +2059,7 @@ error_out:
        kfree(sbi);
        sb->s_fs_info = NULL;
+        unlock_kernel();
        return -EINVAL;
 }
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
 config UFS_FS
        tristate "UFS file system support (read only)"
        depends on BLOCK
+        depends on BKL # probably fixable
        help
          BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
          OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02b1fb3..12f39b9e4437 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        error = ufs_add_nondir(dentry, inode);
        unlock_kernel();
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d510c1b91817..2c47daed56da 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -696,6 +696,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
        unsigned maxsymlen;
        int ret = -EINVAL;
+        lock_kernel();
        uspi = NULL;
        ubh = NULL;
        flags = 0;
@@ -1163,6 +1165,7 @@ magic_found:
                        goto failed;
        UFSD("EXIT\n");
+        unlock_kernel();
        return 0;
 dalloc_failed:
@@ -1174,10 +1177,12 @@ failed:
        kfree(sbi);
        sb->s_fs_info = NULL;
        UFSD("EXIT (FAILED)\n");
+        unlock_kernel();
        return ret;
 failed_nomem:
        UFSD("EXIT (NOMEM)\n");
+        unlock_kernel();
        return -ENOMEM;
 }
@@ -1449,16 +1454,16 @@ static const struct super_operations ufs_super_ops = {
        .show_options   = ufs_show_options,
 };
-static int ufs_get_sb(struct file_system_type *fs_type,
+static struct dentry *ufs_mount(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+        int flags, const char *dev_name, void *data)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
+        return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
 }
 static struct file_system_type ufs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ufs",
-        .get_sb         = ufs_get_sb,
+        .mount          = ufs_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
 config XFS_QUOTA
        bool "XFS Quota support"
        depends on XFS_FS
+        select QUOTACTL
        help
          If you say Y here, you will be able to set limits for disk usage on
          a per user and/or a per group basis under XFS.  XFS considers quota
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..7d287afccde5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1111,11 +1111,12 @@ xfs_vm_writepage(
                        uptodate = 0;
                /*
-                 * A hole may still be marked uptodate because discard_buffer
+                 * set_page_dirty dirties all buffers in a page, independent
-                 * leaves the flag set.
+                 * of their state.  The dirty state however is entirely
+                 * meaningless for holes (!mapped && uptodate), so skip
+                 * buffers covering holes here.
                 */
                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-                        ASSERT(!buffer_dirty(bh));
                        imap_valid = 0;
                        continue;
                }
@@ -1139,8 +1140,7 @@ xfs_vm_writepage(
                                type = IO_DELAY;
                                flags = BMAPI_ALLOCATE;
-                                if (wbc->sync_mode == WB_SYNC_NONE &&
+                                if (wbc->sync_mode == WB_SYNC_NONE)
-                                    wbc->nonblocking)
                                        flags |= BMAPI_TRYLOCK;
                        }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..aa1d353def29 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,8 +188,8 @@ _xfs_buf_initialize(
        atomic_set(&bp->b_hold, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
-        INIT_LIST_HEAD(&bp->b_hash_list);
+        RB_CLEAR_NODE(&bp->b_rbnode);
-        init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+        sema_init(&bp->b_sema, 0); /* held, no waiters */
        XB_SET_OWNER(bp);
        bp->b_target = target;
        bp->b_file_offset = range_base;
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
-        ASSERT(list_empty(&bp->b_hash_list));
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
        xfs_off_t               range_base;
        size_t                  range_length;
-        xfs_bufhash_t           *hash;
+        struct xfs_perag        *pag;
-        xfs_buf_t               *bp, *n;
+        struct rb_node          **rbp;
+        struct rb_node          *parent;
+        xfs_buf_t               *bp;
        range_base = (ioff << BBSHIFT);
        range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
        ASSERT(!(range_length < (1 << btp->bt_sshift)));
        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
-        hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
+        /* get tree root */
+        pag = xfs_perag_get(btp->bt_mount,
-        spin_lock(&hash->bh_lock);
+                                xfs_daddr_to_agno(btp->bt_mount, ioff));
-        list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
+        /* walk tree */
-                ASSERT(btp == bp->b_target);
+        spin_lock(&pag->pag_buf_lock);
-                if (bp->b_file_offset == range_base &&
+        rbp = &pag->pag_buf_tree.rb_node;
-                    bp->b_buffer_length == range_length) {
+        parent = NULL;
+        bp = NULL;
+        while (*rbp) {
+                parent = *rbp;
+                bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+                if (range_base < bp->b_file_offset)
+                        rbp = &(*rbp)->rb_left;
+                else if (range_base > bp->b_file_offset)
+                        rbp = &(*rbp)->rb_right;
+                else {
+                        /*
+                         * found a block offset match. If the range doesn't
+                         * match, the only way this is allowed is if the buffer
+                         * in the cache is stale and the transaction that made
+                         * it stale has not yet committed. i.e. we are
+                         * reallocating a busy extent. Skip this buffer and
+                         * continue searching to the right for an exact match.
+                         */
+                        if (bp->b_buffer_length != range_length) {
+                                ASSERT(bp->b_flags & XBF_STALE);
+                                rbp = &(*rbp)->rb_right;
+                                continue;
+                        }
                        atomic_inc(&bp->b_hold);
                        goto found;
                }
@@ -449,17 +472,21 @@ _xfs_buf_find(
        if (new_bp) {
                _xfs_buf_initialize(new_bp, btp, range_base,
                                range_length, flags);
-                new_bp->b_hash = hash;
+                rb_link_node(&new_bp->b_rbnode, parent, rbp);
-                list_add(&new_bp->b_hash_list, &hash->bh_list);
+                rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+                /* the buffer keeps the perag reference until it is freed */
+                new_bp->b_pag = pag;
+                spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(xb_miss_locked);
+                spin_unlock(&pag->pag_buf_lock);
+                xfs_perag_put(pag);
        }
-        spin_unlock(&hash->bh_lock);
        return new_bp;
 found:
-        spin_unlock(&hash->bh_lock);
+        spin_unlock(&pag->pag_buf_lock);
+        xfs_perag_put(pag);
        /* Attempt to get the semaphore without sleeping,
         * if this does not work then we need to drop the
@@ -625,8 +652,7 @@ void
 xfs_buf_readahead(
        xfs_buftarg_t           *target,
        xfs_off_t               ioff,
-        size_t                  isize,
+        size_t                  isize)
-        xfs_buf_flags_t         flags)
 {
        struct backing_dev_info *bdi;
@@ -634,8 +660,42 @@ xfs_buf_readahead(
        if (bdi_read_congested(bdi))
                return;
-        flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
+        xfs_buf_read(target, ioff, isize,
-        xfs_buf_read(target, ioff, isize, flags);
+                     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+}
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+        struct xfs_mount        *mp,
+        struct xfs_buftarg      *target,
+        xfs_daddr_t             daddr,
+        size_t                  length,
+        int                     flags)
+{
+        xfs_buf_t               *bp;
+        int                     error;
+        bp = xfs_buf_get_uncached(target, length, flags);
+        if (!bp)
+                return NULL;
+        /* set up the buffer for a read IO */
+        xfs_buf_lock(bp);
+        XFS_BUF_SET_ADDR(bp, daddr);
+        XFS_BUF_READ(bp);
+        XFS_BUF_BUSY(bp);
+        xfsbdstrat(mp, bp);
+        error = xfs_buf_iowait(bp);
+        if (error || bp->b_error) {
+                xfs_buf_relse(bp);
+                return NULL;
+        }
+        return bp;
 }
 xfs_buf_t *
@@ -707,9 +767,10 @@ xfs_buf_associate_memory(
 }
 xfs_buf_t *
-xfs_buf_get_noaddr(
+xfs_buf_get_uncached(
+        struct xfs_buftarg      *target,
        size_t                  len,
-        xfs_buftarg_t           *target)
+        int                     flags)
 {
        unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
        int                     error, i;
@@ -725,7 +786,7 @@ xfs_buf_get_noaddr(
                goto fail_free_buf;
        for (i = 0; i < page_count; i++) {
-                bp->b_pages[i] = alloc_page(GFP_KERNEL);
+                bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
                if (!bp->b_pages[i])
                        goto fail_free_mem;
        }
@@ -740,7 +801,7 @@ xfs_buf_get_noaddr(
        xfs_buf_unlock(bp);
-        trace_xfs_buf_get_noaddr(bp, _RET_IP_);
+        trace_xfs_buf_get_uncached(bp, _RET_IP_);
        return bp;
 fail_free_mem:
@@ -774,29 +835,30 @@ void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
-        xfs_bufhash_t           *hash = bp->b_hash;
+        struct xfs_perag        *pag = bp->b_pag;
        trace_xfs_buf_rele(bp, _RET_IP_);
-        if (unlikely(!hash)) {
+        if (!pag) {
                ASSERT(!bp->b_relse);
+                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
                return;
        }
+        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
-        if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
-                        spin_unlock(&hash->bh_lock);
+                        spin_unlock(&pag->pag_buf_lock);
-                        (*(bp->b_relse)) (bp);
+                        bp->b_relse(bp);
-                } else if (bp->b_flags & XBF_FS_MANAGED) {
-                        spin_unlock(&hash->bh_lock);
                } else {
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                        list_del_init(&bp->b_hash_list);
+                        rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                        spin_unlock(&hash->bh_lock);
+                        spin_unlock(&pag->pag_buf_lock);
+                        xfs_perag_put(pag);
                        xfs_buf_free(bp);
                }
        }
@@ -859,7 +921,7 @@ xfs_buf_lock(
        trace_xfs_buf_lock(bp, _RET_IP_);
        if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-                xfs_log_force(bp->b_mount, 0);
+                xfs_log_force(bp->b_target->bt_mount, 0);
        if (atomic_read(&bp->b_io_remaining))
                blk_run_address_space(bp->b_target->bt_mapping);
        down(&bp->b_sema);
@@ -924,19 +986,7 @@ xfs_buf_iodone_work(
        xfs_buf_t               *bp =
                container_of(work, xfs_buf_t, b_iodone_work);
-        /*
+        if (bp->b_iodone)
-         * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-         * ordered flag and reissue them.  Because we can't tell the higher
-         * layers directly that they should not issue ordered I/O anymore, they
-         * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-         */
-        if ((bp->b_error == EOPNOTSUPP) &&
-            (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-                trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-                bp->b_flags &= ~XBF_ORDERED;
-                bp->b_flags |= _XFS_BARRIER_FAILED;
-                xfs_buf_iorequest(bp);
-        } else if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
@@ -982,7 +1032,6 @@ xfs_bwrite(
 {
        int                     error;
-        bp->b_mount = mp;
        bp->b_flags |= XBF_WRITE;
        bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
@@ -1003,8 +1052,6 @@ xfs_bdwrite(
 {
        trace_xfs_buf_bdwrite(bp, _RET_IP_);
-        bp->b_mount = mp;
        bp->b_flags &= ~XBF_READ;
        bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
@@ -1013,7 +1060,7 @@ xfs_bdwrite(
 /*
 * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
 STATIC int
@@ -1030,21 +1077,21 @@ xfs_bioerror(
        XFS_BUF_ERROR(bp, EIO);
        /*
-         * We're calling biodone, so delete XBF_DONE flag.
+         * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
         */
        XFS_BUF_UNREAD(bp);
        XFS_BUF_UNDELAYWRITE(bp);
        XFS_BUF_UNDONE(bp);
        XFS_BUF_STALE(bp);
-        xfs_biodone(bp);
+        xfs_buf_ioend(bp, 0);
        return EIO;
 }
 /*
 * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
+ * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
@@ -1093,7 +1140,7 @@ int
 xfs_bdstrat_cb(
        struct xfs_buf  *bp)
 {
-        if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+        if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
                trace_xfs_bdstrat_shut(bp, _RET_IP_);
                /*
                 * Metadata write that didn't get logged but
@@ -1195,7 +1242,7 @@ _xfs_buf_ioapply(
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
-                rw = WRITE_BARRIER;
+                rw = WRITE_FLUSH_FUA;
        } else if (bp->b_flags & XBF_LOG_BUFFER) {
                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
                bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1399,62 +1446,24 @@ xfs_buf_iomove(
 */
 void
 xfs_wait_buftarg(
-        xfs_buftarg_t   *btp)
+        struct xfs_buftarg      *btp)
-{
-        xfs_buf_t       *bp, *n;
-        xfs_bufhash_t   *hash;
-        uint            i;
-        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-                hash = &btp->bt_hash[i];
-again:
-                spin_lock(&hash->bh_lock);
-                list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-                        ASSERT(btp == bp->b_target);
-                        if (!(bp->b_flags & XBF_FS_MANAGED)) {
-                                spin_unlock(&hash->bh_lock);
-                                /*
-                                 * Catch superblock reference count leaks
-                                 * immediately
-                                 */
-                                BUG_ON(bp->b_bn == 0);
-                                delay(100);
-                                goto again;
-                        }
-                }
-                spin_unlock(&hash->bh_lock);
-        }
-}
-/*
- *      Allocate buffer hash table for a given target.
- *      For devices containing metadata (i.e. not the log/realtime devices)
- *      we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-        xfs_buftarg_t           *btp,
-        int                     external)
 {
-        unsigned int            i;
+        struct xfs_perag        *pag;
+        uint                    i;
-        btp->bt_hashshift = external ? 3 : 12;  /* 8 or 4096 buckets */
+        for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
-        btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+                pag = xfs_perag_get(btp->bt_mount, i);
-                                         sizeof(xfs_bufhash_t));
+                spin_lock(&pag->pag_buf_lock);
-        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
+                while (rb_first(&pag->pag_buf_tree)) {
-                spin_lock_init(&btp->bt_hash[i].bh_lock);
+                        spin_unlock(&pag->pag_buf_lock);
-                INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+                        delay(100);
+                        spin_lock(&pag->pag_buf_lock);
+                }
+                spin_unlock(&pag->pag_buf_lock);
+                xfs_perag_put(pag);
        }
 }
-STATIC void
-xfs_free_bufhash(
-        xfs_buftarg_t           *btp)
-{
-        kmem_free_large(btp->bt_hash);
-        btp->bt_hash = NULL;
-}
 /*
 *      buftarg list for delwrite queue processing
 */
@@ -1487,7 +1496,6 @@ xfs_free_buftarg(
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
-        xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
        /* Unregister the buftarg first so that we don't get a
@@ -1572,6 +1580,7 @@ xfs_mapping_buftarg(
                        XFS_BUFTARG_NAME(btp));
                return ENOMEM;
        }
+        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFBLK;
        inode->i_bdev = bdev;
        inode->i_rdev = bdev->bd_dev;
@@ -1609,6 +1618,7 @@ out_error:
 xfs_buftarg_t *
 xfs_alloc_buftarg(
+        struct xfs_mount        *mp,
        struct block_device     *bdev,
        int                     external,
        const char              *fsname)
@@ -1617,6 +1627,7 @@ xfs_alloc_buftarg(
        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
        if (xfs_setsize_buftarg_early(btp, bdev))
@@ -1625,7 +1636,6 @@ xfs_alloc_buftarg(
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
-        xfs_alloc_bufhash(btp, external);
        return btp;
 error:
@@ -1771,7 +1781,6 @@ xfs_buf_delwri_split(
        INIT_LIST_HEAD(list);
        spin_lock(dwlk);
        list_for_each_entry_safe(bp, n, dwq, b_list) {
-                trace_xfs_buf_delwri_split(bp, _RET_IP_);
                ASSERT(bp->b_flags & XBF_DELWRI);
                if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1794,7 @@ xfs_buf_delwri_split(
                                         _XBF_RUN_QUEUES);
                        bp->b_flags |= XBF_WRITE;
                        list_move_tail(&bp->b_list, list);
+                        trace_xfs_buf_delwri_split(bp, _RET_IP_);
                } else
                        skipped++;
        }
@@ -1916,7 +1926,7 @@ xfs_flush_buftarg(
                        bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
                        list_del_init(&bp->b_list);
-                        xfs_iowait(bp);
+                        xfs_buf_iowait(bp);
                        xfs_buf_relse(bp);
                }
        }
@@ -1933,7 +1943,7 @@ xfs_buf_init(void)
                goto out;
        xfslogd_workqueue = alloc_workqueue("xfslogd",
-                                        WQ_RESCUER | WQ_HIGHPRI, 1);
+                                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..383a3f37cf98 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
 #define XBF_DONE        (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_DELWRI      (1 << 6) /* buffer has dirty pages */
 #define XBF_STALE       (1 << 7) /* buffer has been staled, do not find it */
-#define XBF_FS_MANAGED  (1 << 8) /* filesystem controls freeing memory */
 #define XBF_ORDERED     (1 << 11)/* use ordered writes */
 #define XBF_READ_AHEAD  (1 << 12)/* asynchronous read-ahead */
 #define XBF_LOG_BUFFER  (1 << 13)/* this is a buffer used for the log */
@@ -86,14 +85,6 @@ typedef enum {
 */
 #define _XBF_PAGE_LOCKED        (1 << 22)
-/*
- * If we try a barrier write, but it fails we have to communicate
- * this to the upper layers.  Unfortunately b_error gets overwritten
- * when the buffer is re-issued so we have to add another flag to
- * keep this information.
- */
-#define _XFS_BARRIER_FAILED     (1 << 23)
 typedef unsigned int xfs_buf_flags_t;
 #define XFS_BUF_FLAGS \
@@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t;
        { XBF_DONE,             "DONE" }, \
        { XBF_DELWRI,           "DELWRI" }, \
        { XBF_STALE,            "STALE" }, \
-        { XBF_FS_MANAGED,       "FS_MANAGED" }, \
        { XBF_ORDERED,          "ORDERED" }, \
        { XBF_READ_AHEAD,       "READ_AHEAD" }, \
        { XBF_LOCK,             "LOCK" },       /* should never be set */\
@@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t;
        { _XBF_PAGES,           "PAGES" }, \
        { _XBF_RUN_QUEUES,      "RUN_QUEUES" }, \
        { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
-        { _XBF_PAGE_LOCKED,     "PAGE_LOCKED" }, \
+        { _XBF_PAGE_LOCKED,     "PAGE_LOCKED" }
-        { _XFS_BARRIER_FAILED,  "BARRIER_FAILED" }
 typedef enum {
@@ -132,14 +121,11 @@ typedef struct xfs_buftarg {
        dev_t                   bt_dev;
        struct block_device     *bt_bdev;
        struct address_space    *bt_mapping;
+        struct xfs_mount        *bt_mount;
        unsigned int            bt_bsize;
        unsigned int            bt_sshift;
        size_t                  bt_smask;
-        /* per device buffer hash table */
-        uint                    bt_hashshift;
-        xfs_bufhash_t           *bt_hash;
        /* per device delwri queue */
        struct task_struct      *bt_task;
        struct list_head        bt_list;
@@ -167,34 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 #define XB_PAGES        2
 typedef struct xfs_buf {
+        /*
+         * first cacheline holds all the fields needed for an uncontended cache
+         * hit to be fully processed. The semaphore straddles the cacheline
+         * boundary, but the counter and lock sits on the first cacheline,
+         * which is the only bit that is touched if we hit the semaphore
+         * fast-path on locking.
+         */
+        struct rb_node          b_rbnode;       /* rbtree node */
+        xfs_off_t               b_file_offset;  /* offset in file */
+        size_t                  b_buffer_length;/* size of buffer in bytes */
+        atomic_t                b_hold;         /* reference count */
+        xfs_buf_flags_t         b_flags;        /* status flags */
        struct semaphore        b_sema;         /* semaphore for lockables */
-        unsigned long           b_queuetime;    /* time buffer was queued */
-        atomic_t                b_pin_count;    /* pin count */
        wait_queue_head_t       b_waiters;      /* unpin waiters */
        struct list_head        b_list;
-        xfs_buf_flags_t         b_flags;        /* status flags */
+        struct xfs_perag        *b_pag;         /* contains rbtree root */
-        struct list_head        b_hash_list;    /* hash table list */
-        xfs_bufhash_t           *b_hash;        /* hash table list start */
        xfs_buftarg_t           *b_target;      /* buffer target (device) */
-        atomic_t                b_hold;         /* reference count */
        xfs_daddr_t             b_bn;           /* block number for I/O */
-        xfs_off_t               b_file_offset;  /* offset in file */
-        size_t                  b_buffer_length;/* size of buffer in bytes */
        size_t                  b_count_desired;/* desired transfer size */
        void                    *b_addr;        /* virtual address of buffer */
        struct work_struct      b_iodone_work;
-        atomic_t                b_io_remaining; /* #outstanding I/O requests */
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
        xfs_buf_relse_t         b_relse;        /* releasing function */
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
-        struct xfs_mount        *b_mount;
-        unsigned short          b_error;        /* error code on I/O */
-        unsigned int            b_page_count;   /* size of page array */
-        unsigned int            b_offset;       /* page offset in first page */
        struct page             **b_pages;      /* array of page pointers */
        struct page             *b_page_array[XB_PAGES]; /* inline pages */
+        unsigned long           b_queuetime;    /* time buffer was queued */
+        atomic_t                b_pin_count;    /* pin count */
+        atomic_t                b_io_remaining; /* #outstanding I/O requests */
+        unsigned int            b_page_count;   /* size of page array */
+        unsigned int            b_offset;       /* page offset in first page */
+        unsigned short          b_error;        /* error code on I/O */
 #ifdef XFS_BUF_LOCK_TRACKING
        int                     b_last_holder;
 #endif
@@ -213,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
                                xfs_buf_flags_t);
 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
+extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
 extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
-                                xfs_buf_flags_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+                                struct xfs_buftarg *target,
+                                xfs_daddr_t daddr, size_t length, int flags);
 /* Releasing Buffers */
 extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
                                xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+            xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
@@ -276,8 +273,6 @@ extern void xfs_buf_terminate(void);
                                        XFS_BUF_DONE(bp);       \
                                } while (0)
-#define XFS_BUF_UNMANAGE(bp)    ((bp)->b_flags &= ~XBF_FS_MANAGED)
 #define XFS_BUF_DELAYWRITE(bp)          ((bp)->b_flags |= XBF_DELWRI)
 #define XFS_BUF_UNDELAYWRITE(bp)        xfs_buf_delwri_dequeue(bp)
 #define XFS_BUF_ISDELAYWRITE(bp)        ((bp)->b_flags & XBF_DELWRI)
@@ -356,25 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
        xfs_buf_rele(bp);
 }
-#define xfs_biodone(bp)         xfs_buf_ioend(bp, 0)
-#define xfs_biomove(bp, off, len, data, rw) \
-            xfs_buf_iomove((bp), (off), (len), (data), \
-                ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
-#define xfs_biozero(bp, off, len) \
-            xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-#define xfs_iowait(bp)  xfs_buf_iowait(bp)
-#define xfs_baread(target, rablkno, ralen)  \
-        xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
 /*
 *      Handling of buftargs.
 */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+                        struct block_device *, int, const char *);
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CRED_H__
-#define __XFS_CRED_H__
-#include <linux/capability.h>
-/*
- * Credentials
- */
-typedef const struct cred cred_t;
-#endif  /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
        xfs_off_t       last,
        int             fiopt)
 {
-        struct address_space *mapping = VFS_I(ip)->i_mapping;
+        /* can't toss partial tail pages, so mask them out */
+        last &= ~(PAGE_SIZE - 1);
-        if (mapping->nrpages)
+        truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-                truncate_inode_pages(mapping, first);
 }
 int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
        trace_xfs_pagecache_inval(ip, first, last);
-        if (mapping->nrpages) {
+        xfs_iflags_clear(ip, XFS_ITRUNCATED);
-                xfs_iflags_clear(ip, XFS_ITRUNCATED);
+        ret = filemap_write_and_wait_range(mapping, first,
-                ret = filemap_write_and_wait(mapping);
+                                last == -1 ? LLONG_MAX : last);
-                if (!ret)
+        if (!ret)
-                        truncate_inode_pages(mapping, first);
+                truncate_inode_pages_range(mapping, first, last);
-        }
        return -ret;
 }
@@ -71,10 +69,9 @@ xfs_flush_pages(
        int             ret = 0;
        int             ret2;
-        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+        xfs_iflags_clear(ip, XFS_ITRUNCATED);
-                xfs_iflags_clear(ip, XFS_ITRUNCATED);
+        ret = -filemap_fdatawrite_range(mapping, first,
-                ret = -filemap_fdatawrite(mapping);
+                                last == -1 ? LLONG_MAX : last);
-        }
        if (flags & XBF_ASYNC)
                return ret;
        ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
 {
        struct address_space *mapping = VFS_I(ip)->i_mapping;
-        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-                return -filemap_fdatawait(mapping);
+                return -filemap_fdatawait_range(mapping, first,
+                                        last == -1 ? ip->i_size - 1 : last);
+        }
        return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
 #include "xfs.h"
-#include "xfs_cred.h"
 #include "xfs_sysctl.h"
 /*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-extern uint64_t xfs_panic_mask;         /* set to cause more panics */
-#endif  /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..ad442d9e392e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
-        kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+        kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
        if (!kbuf)
                goto out_dput;
@@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr(
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        fa.fsx_xflags = xfs_ip2xflags(ip);
        fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-        fa.fsx_projid = ip->i_d.di_projid;
+        fa.fsx_projid = xfs_get_projid(ip);
        if (attr) {
                if (ip->i_afp) {
@@ -909,10 +909,10 @@ xfs_ioctl_setattr(
                return XFS_ERROR(EIO);
        /*
-         * Disallow 32bit project ids because on-disk structure
+         * Disallow 32bit project ids when projid32bit feature is not enabled.
-         * is 16bit only.
         */
-        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+                        !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
                return XFS_ERROR(EINVAL);
        /*
@@ -961,7 +961,7 @@ xfs_ioctl_setattr(
        if (mask & FSX_PROJID) {
                if (XFS_IS_QUOTA_RUNNING(mp) &&
                    XFS_IS_PQUOTA_ON(mp) &&
-                    ip->i_d.di_projid != fa->fsx_projid) {
+                    xfs_get_projid(ip) != fa->fsx_projid) {
                        ASSERT(tp);
                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                capable(CAP_FOWNER) ?
@@ -1063,12 +1063,12 @@ xfs_ioctl_setattr(
                 * Change the ownerships and register quota modifications
                 * in the transaction.
                 */
-                if (ip->i_d.di_projid != fa->fsx_projid) {
+                if (xfs_get_projid(ip) != fa->fsx_projid) {
                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
                                olddquot = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
-                        ip->i_d.di_projid = fa->fsx_projid;
+                        xfs_set_projid(ip, fa->fsx_projid);
                        /*
                         * We may have to rev the inode as well as
@@ -1088,8 +1088,8 @@ xfs_ioctl_setattr(
                xfs_diflags_to_linux(ip);
        }
+        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
        XFS_STATS_INC(xs_ig_attrchg);
@@ -1301,7 +1301,8 @@ xfs_file_ioctl(
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP64:
        case XFS_IOC_RESVSP64:
-        case XFS_IOC_UNRESVSP64: {
+        case XFS_IOC_UNRESVSP64:
+        case XFS_IOC_ZERO_RANGE: {
                xfs_flock64_t           bf;
                if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..b3486dfa5520 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
            get_user(bstat->bs_extsize, &bstat32->bs_extsize)   ||
            get_user(bstat->bs_extents, &bstat32->bs_extents)   ||
            get_user(bstat->bs_gen,     &bstat32->bs_gen)       ||
-            get_user(bstat->bs_projid,  &bstat32->bs_projid)    ||
+            get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+            get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
            get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
            get_user(bstat->bs_dmstate, &bstat32->bs_dmstate)   ||
            get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
            put_user(buffer->bs_extents,  &p32->bs_extents)     ||
            put_user(buffer->bs_gen,      &p32->bs_gen)         ||
            put_user(buffer->bs_projid,   &p32->bs_projid)      ||
+            put_user(buffer->bs_projid_hi,      &p32->bs_projid_hi)     ||
            put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)    ||
            put_user(buffer->bs_dmstate,  &p32->bs_dmstate)     ||
            put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
        case XFS_IOC_FSGEOMETRY_V1:
        case XFS_IOC_FSGROWFSDATA:
        case XFS_IOC_FSGROWFSRT:
+        case XFS_IOC_ZERO_RANGE:
                return xfs_file_ioctl(filp, cmd, p);
 #else
        case XFS_IOC_ALLOCSP_32:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..08b605792a99 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
        __s32           bs_extsize;     /* extent size                  */
        __s32           bs_extents;     /* number of extents            */
        __u32           bs_gen;         /* generation count             */
-        __u16           bs_projid;      /* project id                   */
+        __u16           bs_projid_lo;   /* lower part of project id     */
-        unsigned char   bs_pad[14];     /* pad space, unused            */
+#define bs_projid       bs_projid_lo    /* (previously just bs_projid)  */
+        __u16           bs_projid_hi;   /* high part of project id      */
+        unsigned char   bs_pad[12];     /* pad space, unused            */
        __u32           bs_dmevmask;    /* DMIG event mask              */
        __u16           bs_dmstate;     /* DMIG state info              */
        __u16           bs_aextents;    /* attribute number of extents  */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..94d5fd6a2973 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -95,41 +95,6 @@ xfs_mark_inode_dirty(
 }
 /*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- */
-void
-xfs_ichgtime(
-        xfs_inode_t     *ip,
-        int             flags)
-{
-        struct inode    *inode = VFS_I(ip);
-        timespec_t      tv;
-        int             sync_it = 0;
-        tv = current_fs_time(inode->i_sb);
-        if ((flags & XFS_ICHGTIME_MOD) &&
-            !timespec_equal(&inode->i_mtime, &tv)) {
-                inode->i_mtime = tv;
-                sync_it = 1;
-        }
-        if ((flags & XFS_ICHGTIME_CHG) &&
-            !timespec_equal(&inode->i_ctime, &tv)) {
-                inode->i_ctime = tv;
-                sync_it = 1;
-        }
-        /*
-         * Update complete - now make sure everyone knows that the inode
-         * is dirty.
-         */
-        if (sync_it)
-                xfs_mark_inode_dirty_sync(ip);
-}
-/*
 * Hook in SELinux.  This is not quite correct yet, what we really need
 * here (as we do for default ACLs) is a mechanism by which creation of
 * these attrs can be journalled at inode creation time (along with the
@@ -224,7 +189,7 @@ xfs_vn_mknod(
        }
        xfs_dentry_to_name(&name, dentry);
-        error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
+        error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
        if (unlikely(error))
                goto out_free_acl;
@@ -352,7 +317,7 @@ xfs_vn_link(
        if (unlikely(error))
                return -error;
-        atomic_inc(&inode->i_count);
+        ihold(inode);
        d_instantiate(dentry, inode);
        return 0;
 }
@@ -397,7 +362,7 @@ xfs_vn_symlink(
                (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
        xfs_dentry_to_name(&name, dentry);
-        error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
+        error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
        if (unlikely(error))
                goto out;
@@ -795,7 +760,10 @@ xfs_setup_inode(
        inode->i_ino = ip->i_ino;
        inode->i_state = I_NEW;
-        inode_add_to_lists(ip->i_mount->m_super, inode);
+        inode_sb_list_add(inode);
+        /* make the inode look hashed for the writeback code */
+        hlist_add_fake(&inode->i_hash);
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..214ddd71ff79 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -71,6 +71,7 @@
 #include <linux/random.h>
 #include <linux/ctype.h>
 #include <linux/writeback.h>
+#include <linux/capability.h>
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -79,14 +80,12 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
-#include <xfs_cred.h>
 #include <xfs_vnode.h>
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
 #include <xfs_iops.h>
 #include <xfs_aops.h>
 #include <xfs_super.h>
-#include <xfs_globals.h>
 #include <xfs_buf.h>
 /*
@@ -144,7 +143,7 @@
 #define SYNCHRONIZE()   barrier()
 #define __return_address __builtin_return_address(0)
-#define dfltprid        0
+#define XFS_PROJID_DEFAULT      0
 #define MAXPATHLEN      1024
 #define MIN(a,b)        (min(a,b))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..064f964d4f3c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
-#include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
@@ -354,9 +353,6 @@ xfs_parseargs(
                        mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
                        mp->m_flags |= XFS_MOUNT_DELAYLOG;
-                        cmn_err(CE_WARN,
-                                "Enabling EXPERIMENTAL delayed logging feature "
-                                "- use at your own risk.\n");
                } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
                        mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
                } else if (!strcmp(this_char, "ihashsize")) {
@@ -577,7 +573,7 @@ xfs_max_file_offset(
        /* Figure out maximum filesize, on Linux this can depend on
         * the filesystem blocksize (on 32 bit platforms).
-         * __block_prepare_write does this in an [unsigned] long...
+         * __block_write_begin does this in an [unsigned] long...
         *      page->index << (PAGE_CACHE_SHIFT - bbits)
         * So, for page sized blocks (4K on 32 bit platforms),
         * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -645,7 +641,7 @@ xfs_barrier_test(
        XFS_BUF_ORDERED(sbp);
        xfsbdstrat(mp, sbp);
-        error = xfs_iowait(sbp);
+        error = xfs_buf_iowait(sbp);
        /*
         * Clear all the flags we set and possible error state in the
@@ -693,8 +689,7 @@ void
 xfs_blkdev_issue_flush(
        xfs_buftarg_t           *buftarg)
 {
-        blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+        blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
-                        BLKDEV_IFL_WAIT);
 }
 STATIC void
@@ -758,18 +753,20 @@ xfs_open_devices(
         * Setup xfs_mount buffer target pointers
         */
        error = ENOMEM;
-        mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
+        mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
        if (!mp->m_ddev_targp)
                goto out_close_rtdev;
        if (rtdev) {
-                mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
+                mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+                                                        mp->m_fsname);
                if (!mp->m_rtdev_targp)
                        goto out_free_ddev_targ;
        }
        if (logdev && logdev != ddev) {
-                mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
+                mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+                                                        mp->m_fsname);
                if (!mp->m_logdev_targp)
                        goto out_free_rtdev_targ;
        } else {
@@ -972,12 +969,7 @@ xfs_fs_inode_init_once(
 /*
 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode. Care must be taken
+ * we catch unlogged VFS level updates to the inode.
- * here - the transaction code calls mark_inode_dirty_sync() to mark the
- * VFS inode dirty in a transaction and clears the i_update_core field;
- * it must clear the field after calling mark_inode_dirty_sync() to
- * correctly indicate that the dirty state has been propagated into the
- * inode log item.
 *
 * We need the barrier() to maintain correct ordering between unlogged
 * updates and the transaction commit code that clears the i_update_core
@@ -1521,8 +1513,9 @@ xfs_fs_fill_super(
        if (error)
                goto out_free_fsname;
-        if (xfs_icsb_init_counters(mp))
+        error = xfs_icsb_init_counters(mp);
-                mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+        if (error)
+                goto out_close_devices;
        error = xfs_readsb(mp, flags);
        if (error)
@@ -1583,6 +1576,7 @@ xfs_fs_fill_super(
        xfs_freesb(mp);
 out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
+ out_close_devices:
        xfs_close_devices(mp);
 out_free_fsname:
        xfs_free_fsname(mp);
@@ -1612,16 +1606,14 @@ xfs_fs_fill_super(
        goto out_free_sb;
 }
-STATIC int
+STATIC struct dentry *
-xfs_fs_get_sb(
+xfs_fs_mount(
        struct file_system_type *fs_type,
        int                     flags,
        const char              *dev_name,
-        void                    *data,
+        void                    *data)
-        struct vfsmount         *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+        return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
-                           mnt);
 }
 static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1634,7 @@ static const struct super_operations xfs_super_operations = {
 static struct file_system_type xfs_fs_type = {
        .owner                  = THIS_MODULE,
        .name                   = "xfs",
-        .get_sb                 = xfs_fs_get_sb,
+        .mount                  = xfs_fs_mount,
        .kill_sb                = kill_block_super,
        .fs_flags               = FS_REQUIRES_DEV,
 };
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
 # define XFS_DBG_STRING         "no debug"
 #endif
+#define XFS_VERSION_STRING      "SGI XFS"
 #define XFS_BUILD_OPTIONS       XFS_ACL_STRING \
                                XFS_SECURITY_STRING \
                                XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..afb0d7cfad1c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -39,42 +39,39 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH        32
-STATIC xfs_inode_t *
+STATIC int
-xfs_inode_ag_lookup(
+xfs_inode_ag_walk_grab(
-        struct xfs_mount        *mp,
+        struct xfs_inode        *ip)
-        struct xfs_perag        *pag,
-        uint32_t                *first_index,
-        int                     tag)
 {
-        int                     nr_found;
+        struct inode            *inode = VFS_I(ip);
-        struct xfs_inode        *ip;
-        /*
+        /* nothing to sync during shutdown */
-         * use a gang lookup to find the next inode in the tree
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-         * as the tree is sparse and a gang lookup walks to find
+                return EFSCORRUPTED;
-         * the number of objects requested.
-         */
+        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-        if (tag == XFS_ICI_NO_TAG) {
+        if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                return ENOENT;
-                                (void **)&ip, *first_index, 1);
-        } else {
+        /* If we can't grab the inode, it must be on its way to reclaim. */
-                nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+        if (!igrab(inode))
-                                (void **)&ip, *first_index, 1, tag);
+                return ENOENT;
+        if (is_bad_inode(inode)) {
+                IRELE(ip);
+                return ENOENT;
        }
-        if (!nr_found)
-                return NULL;
-        /*
+        /* inode is valid */
-         * Update the index for the next lookup. Catch overflows
+        return 0;
-         * into the next AG range which can occur if we have inodes
-         * in the last block of the AG and we are currently
-         * pointing to the last inode.
-         */
-        *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-        if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                return NULL;
-        return ip;
 }
 STATIC int
@@ -83,49 +80,75 @@ xfs_inode_ag_walk(
        struct xfs_perag        *pag,
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
-        int                     flags,
+        int                     flags)
-        int                     tag,
-        int                     exclusive,
-        int                     *nr_to_scan)
 {
        uint32_t                first_index;
        int                     last_error = 0;
        int                     skipped;
+        int                     done;
+        int                     nr_found;
 restart:
+        done = 0;
        skipped = 0;
        first_index = 0;
+        nr_found = 0;
        do {
+                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                int             error = 0;
-                xfs_inode_t     *ip;
+                int             i;
-                if (exclusive)
+                read_lock(&pag->pag_ici_lock);
-                        write_lock(&pag->pag_ici_lock);
+                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-                else
+                                        (void **)batch, first_index,
-                        read_lock(&pag->pag_ici_lock);
+                                        XFS_LOOKUP_BATCH);
-                ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+                if (!nr_found) {
-                if (!ip) {
+                        read_unlock(&pag->pag_ici_lock);
-                        if (exclusive)
-                                write_unlock(&pag->pag_ici_lock);
-                        else
-                                read_unlock(&pag->pag_ici_lock);
                        break;
                }
-                /* execute releases pag->pag_ici_lock */
+                /*
-                error = execute(ip, pag, flags);
+                 * Grab the inodes before we drop the lock. If we found
-                if (error == EAGAIN) {
+                 * nothing, nr_found == 0 and the loop will be skipped.
-                        skipped++;
+                 */
-                        continue;
+                for (i = 0; i < nr_found; i++) {
+                        struct xfs_inode *ip = batch[i];
+                        if (done || xfs_inode_ag_walk_grab(ip))
+                                batch[i] = NULL;
+                        /*
+                         * Update the index for the next lookup. Catch overflows
+                         * into the next AG range which can occur if we have inodes
+                         * in the last block of the AG and we are currently
+                         * pointing to the last inode.
+                         */
+                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                done = 1;
+                }
+                /* unlock now we've grabbed the inodes. */
+                read_unlock(&pag->pag_ici_lock);
+                for (i = 0; i < nr_found; i++) {
+                        if (!batch[i])
+                                continue;
+                        error = execute(batch[i], pag, flags);
+                        IRELE(batch[i]);
+                        if (error == EAGAIN) {
+                                skipped++;
+                                continue;
+                        }
+                        if (error && last_error != EFSCORRUPTED)
+                                last_error = error;
                }
-                if (error)
-                        last_error = error;
                /* bail out if the filesystem is corrupted.  */
                if (error == EFSCORRUPTED)
                        break;
-        } while ((*nr_to_scan)--);
+        } while (nr_found && !done);
        if (skipped) {
                delay(1);
@@ -134,110 +157,32 @@ restart:
        return last_error;
 }
-/*
- * Select the next per-ag structure to iterate during the walk. The reclaim
- * walk is optimised only to walk AGs with reclaimable inodes in them.
- */
-static struct xfs_perag *
-xfs_inode_ag_iter_next_pag(
-        struct xfs_mount        *mp,
-        xfs_agnumber_t          *first,
-        int                     tag)
-{
-        struct xfs_perag        *pag = NULL;
-        if (tag == XFS_ICI_RECLAIM_TAG) {
-                int found;
-                int ref;
-                spin_lock(&mp->m_perag_lock);
-                found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-                                (void **)&pag, *first, 1, tag);
-                if (found <= 0) {
-                        spin_unlock(&mp->m_perag_lock);
-                        return NULL;
-                }
-                *first = pag->pag_agno + 1;
-                /* open coded pag reference increment */
-                ref = atomic_inc_return(&pag->pag_ref);
-                spin_unlock(&mp->m_perag_lock);
-                trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
-        } else {
-                pag = xfs_perag_get(mp, *first);
-                (*first)++;
-        }
-        return pag;
-}
 int
 xfs_inode_ag_iterator(
        struct xfs_mount        *mp,
        int                     (*execute)(struct xfs_inode *ip,
                                           struct xfs_perag *pag, int flags),
-        int                     flags,
+        int                     flags)
-        int                     tag,
-        int                     exclusive,
-        int                     *nr_to_scan)
 {
        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          ag;
-        int                     nr;
-        nr = nr_to_scan ? *nr_to_scan : INT_MAX;
        ag = 0;
-        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
+        while ((pag = xfs_perag_get(mp, ag))) {
-                error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+                ag = pag->pag_agno + 1;
-                                                exclusive, &nr);
+                error = xfs_inode_ag_walk(mp, pag, execute, flags);
                xfs_perag_put(pag);
                if (error) {
                        last_error = error;
                        if (error == EFSCORRUPTED)
                                break;
                }
-                if (nr <= 0)
-                        break;
        }
-        if (nr_to_scan)
-                *nr_to_scan = nr;
        return XFS_ERROR(last_error);
 }
-/* must be called with pag_ici_lock held and releases it */
-int
-xfs_sync_inode_valid(
-        struct xfs_inode        *ip,
-        struct xfs_perag        *pag)
-{
-        struct inode            *inode = VFS_I(ip);
-        int                     error = EFSCORRUPTED;
-        /* nothing to sync during shutdown */
-        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-                goto out_unlock;
-        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-        error = ENOENT;
-        if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-                goto out_unlock;
-        /* If we can't grab the inode, it must on it's way to reclaim. */
-        if (!igrab(inode))
-                goto out_unlock;
-        if (is_bad_inode(inode)) {
-                IRELE(ip);
-                goto out_unlock;
-        }
-        /* inode is valid */
-        error = 0;
-out_unlock:
-        read_unlock(&pag->pag_ici_lock);
-        return error;
-}
 STATIC int
 xfs_sync_inode_data(
        struct xfs_inode        *ip,
@@ -248,10 +193,6 @@ xfs_sync_inode_data(
        struct address_space *mapping = inode->i_mapping;
        int                     error = 0;
-        error = xfs_sync_inode_valid(ip, pag);
-        if (error)
-                return error;
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                goto out_wait;
@@ -268,7 +209,6 @@ xfs_sync_inode_data(
 out_wait:
        if (flags & SYNC_WAIT)
                xfs_ioend_wait(ip);
-        IRELE(ip);
        return error;
 }
@@ -280,10 +220,6 @@ xfs_sync_inode_attr(
 {
        int                     error = 0;
-        error = xfs_sync_inode_valid(ip, pag);
-        if (error)
-                return error;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_inode_clean(ip))
                goto out_unlock;
@@ -302,7 +238,6 @@ xfs_sync_inode_attr(
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        IRELE(ip);
        return error;
 }
@@ -318,8 +253,7 @@ xfs_sync_data(
        ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-        error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
+        error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-                                      XFS_ICI_NO_TAG, 0, NULL);
        if (error)
                return XFS_ERROR(error);
@@ -337,8 +271,7 @@ xfs_sync_attr(
 {
        ASSERT((flags & ~SYNC_WAIT) == 0);
-        return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
+        return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
-                                     XFS_ICI_NO_TAG, 0, NULL);
 }
 STATIC int
@@ -698,6 +631,43 @@ __xfs_inode_clear_reclaim_tag(
 }
 /*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+        struct xfs_inode        *ip,
+        int                     flags)
+{
+        /*
+         * Do some unlocked checks first to avoid unnecessary lock traffic.
+         * The first is a flush lock check, the second is an already in reclaim
+         * check. Only do these checks if we are not going to block on locks.
+         */
+        if ((flags & SYNC_TRYLOCK) &&
+            (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+                return 1;
+        }
+        /*
+         * The radix tree lock here protects a thread in xfs_iget from racing
+         * with us starting reclaim on the inode.  Once we have the
+         * XFS_IRECLAIM flag set it will not touch us.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+        if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                /* ignore as it is already under reclaim */
+                spin_unlock(&ip->i_flags_lock);
+                return 1;
+        }
+        __xfs_iflags_set(ip, XFS_IRECLAIM);
+        spin_unlock(&ip->i_flags_lock);
+        return 0;
+}
+/*
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -755,23 +725,6 @@ xfs_reclaim_inode(
 {
        int     error = 0;
-        /*
-         * The radix tree lock here protects a thread in xfs_iget from racing
-         * with us starting reclaim on the inode.  Once we have the
-         * XFS_IRECLAIM flag set it will not touch us.
-         */
-        spin_lock(&ip->i_flags_lock);
-        ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-        if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                /* ignore as it is already under reclaim */
-                spin_unlock(&ip->i_flags_lock);
-                write_unlock(&pag->pag_ici_lock);
-                return 0;
-        }
-        __xfs_iflags_set(ip, XFS_IRECLAIM);
-        spin_unlock(&ip->i_flags_lock);
-        write_unlock(&pag->pag_ici_lock);
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (!xfs_iflock_nowait(ip)) {
                if (!(sync_mode & SYNC_WAIT))
@@ -868,13 +821,127 @@ reclaim:
 }
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shutdown during the filesystem unmount reclaim walk leaks all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+        struct xfs_mount        *mp,
+        int                     flags,
+        int                     *nr_to_scan)
+{
+        struct xfs_perag        *pag;
+        int                     error = 0;
+        int                     last_error = 0;
+        xfs_agnumber_t          ag;
+        int                     trylock = flags & SYNC_TRYLOCK;
+        int                     skipped;
+restart:
+        ag = 0;
+        skipped = 0;
+        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+                unsigned long   first_index = 0;
+                int             done = 0;
+                int             nr_found = 0;
+                ag = pag->pag_agno + 1;
+                if (trylock) {
+                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+                                skipped++;
+                                xfs_perag_put(pag);
+                                continue;
+                        }
+                        first_index = pag->pag_ici_reclaim_cursor;
+                } else
+                        mutex_lock(&pag->pag_ici_reclaim_lock);
+                do {
+                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+                        int     i;
+                        write_lock(&pag->pag_ici_lock);
+                        nr_found = radix_tree_gang_lookup_tag(
+                                        &pag->pag_ici_root,
+                                        (void **)batch, first_index,
+                                        XFS_LOOKUP_BATCH,
+                                        XFS_ICI_RECLAIM_TAG);
+                        if (!nr_found) {
+                                write_unlock(&pag->pag_ici_lock);
+                                break;
+                        }
+                        /*
+                         * Grab the inodes before we drop the lock. If we found
+                         * nothing, nr_found == 0 and the loop will be skipped.
+                         */
+                        for (i = 0; i < nr_found; i++) {
+                                struct xfs_inode *ip = batch[i];
+                                if (done || xfs_reclaim_inode_grab(ip, flags))
+                                        batch[i] = NULL;
+                                /*
+                                 * Update the index for the next lookup. Catch
+                                 * overflows into the next AG range which can
+                                 * occur if we have inodes in the last block of
+                                 * the AG and we are currently pointing to the
+                                 * last inode.
+                                 */
+                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                        done = 1;
+                        }
+                        /* unlock now we've grabbed the inodes. */
+                        write_unlock(&pag->pag_ici_lock);
+                        for (i = 0; i < nr_found; i++) {
+                                if (!batch[i])
+                                        continue;
+                                error = xfs_reclaim_inode(batch[i], pag, flags);
+                                if (error && last_error != EFSCORRUPTED)
+                                        last_error = error;
+                        }
+                        *nr_to_scan -= XFS_LOOKUP_BATCH;
+                } while (nr_found && !done && *nr_to_scan > 0);
+                if (trylock && !done)
+                        pag->pag_ici_reclaim_cursor = first_index;
+                else
+                        pag->pag_ici_reclaim_cursor = 0;
+                mutex_unlock(&pag->pag_ici_reclaim_lock);
+                xfs_perag_put(pag);
+        }
+        /*
+         * If we skipped any AG, and we still have scan count remaining, do
+         * another pass this time using blocking reclaim semantics (i.e.
+         * waiting on the reclaim locks and ignoring the reclaim cursors). This
+         * ensures that when we get more reclaimers than AGs we block rather
+         * than spin trying to execute reclaim.
+         */
+        if (trylock && skipped && *nr_to_scan > 0) {
+                trylock = 0;
+                goto restart;
+        }
+        return XFS_ERROR(last_error);
+}
 int
 xfs_reclaim_inodes(
        xfs_mount_t     *mp,
        int             mode)
 {
-        return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+        int             nr_to_scan = INT_MAX;
-                                        XFS_ICI_RECLAIM_TAG, 1, NULL);
+        return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
 }
 /*
@@ -896,17 +963,16 @@ xfs_reclaim_inode_shrink(
                if (!(gfp_mask & __GFP_FS))
                        return -1;
-                xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+                xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
-                                        XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+                /* terminate if we don't exhaust the scan */
-                /* if we don't exhaust the scan, don't bother coming back */
                if (nr_to_scan > 0)
                        return -1;
       }
        reclaimable = 0;
        ag = 0;
-        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
+        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-                                        XFS_ICI_RECLAIM_TAG))) {
+                ag = pag->pag_agno + 1;
                reclaimable += pag->pag_ici_reclaimable;
                xfs_perag_put(pag);
        }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..32ba6628290c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
                                struct xfs_inode *ip);
-int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-        int flags, int tag, int write_lock, int *nr_to_scan);
+        int flags);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..acef2e98c594 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name,	\
                 unsigned long caller_ip),                                      \
        TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_cond_lock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
-DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
 DEFINE_BUF_EVENT(xfs_buf_iowait_done);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_get_noaddr);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone);
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VERSION_H__
-#define __XFS_VERSION_H__
-/*
- * Dummy file that can contain a timestamp to put into the
- * XFS init string, to help users keep track of what they're
- * running
- */
-#define XFS_VERSION_STRING "SGI XFS"
-#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..faf8e1a83a12 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -463,87 +463,68 @@ xfs_qm_dqtobp(
        uint                    flags)
 {
        xfs_bmbt_irec_t map;
-        int             nmaps, error;
+        int             nmaps = 1, error;
        xfs_buf_t       *bp;
-        xfs_inode_t     *quotip;
+        xfs_inode_t     *quotip = XFS_DQ_TO_QIP(dqp);
-        xfs_mount_t     *mp;
+        xfs_mount_t     *mp = dqp->q_mount;
        xfs_disk_dquot_t *ddq;
-        xfs_dqid_t      id;
+        xfs_dqid_t      id = be32_to_cpu(dqp->q_core.d_id);
-        boolean_t       newdquot;
        xfs_trans_t     *tp = (tpp ? *tpp : NULL);
-        mp = dqp->q_mount;
+        dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
-        id = be32_to_cpu(dqp->q_core.d_id);
-        nmaps = 1;
-        newdquot = B_FALSE;
-        /*
+        xfs_ilock(quotip, XFS_ILOCK_SHARED);
-         * If we don't know where the dquot lives, find out.
+        if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-         */
-        if (dqp->q_blkno == (xfs_daddr_t) 0) {
-                /* We use the id as an index */
-                dqp->q_fileoffset = (xfs_fileoff_t)id /
-                                        mp->m_quotainfo->qi_dqperchunk;
-                nmaps = 1;
-                quotip = XFS_DQ_TO_QIP(dqp);
-                xfs_ilock(quotip, XFS_ILOCK_SHARED);
                /*
-                 * Return if this type of quotas is turned off while we didn't
+                 * Return if this type of quotas is turned off while we
-                 * have an inode lock
+                 * didn't have the quota inode lock.
                 */
-                if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+                xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-                        xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+                return ESRCH;
-                        return (ESRCH);
+        }
-                }
+        /*
+         * Find the block map; no allocations yet
+         */
+        error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+                          XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+                          NULL, 0, &map, &nmaps, NULL);
+        xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+        if (error)
+                return error;
+        ASSERT(nmaps == 1);
+        ASSERT(map.br_blockcount == 1);
+        /*
+         * Offset of dquot in the (fixed sized) dquot chunk.
+         */
+        dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+                sizeof(xfs_dqblk_t);
+        ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+        if (map.br_startblock == HOLESTARTBLOCK) {
                /*
-                 * Find the block map; no allocations yet
+                 * We don't allocate unless we're asked to
                 */
-                error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+                if (!(flags & XFS_QMOPT_DQALLOC))
-                                  XFS_DQUOT_CLUSTER_SIZE_FSB,
+                        return ENOENT;
-                                  XFS_BMAPI_METADATA,
-                                  NULL, 0, &map, &nmaps, NULL);
-                xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+                ASSERT(tp);
+                error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+                                        dqp->q_fileoffset, &bp);
                if (error)
-                        return (error);
+                        return error;
-                ASSERT(nmaps == 1);
+                tp = *tpp;
-                ASSERT(map.br_blockcount == 1);
+        } else {
+                trace_xfs_dqtobp_read(dqp);
                /*
-                 * offset of dquot in the (fixed sized) dquot chunk.
+                 * store the blkno etc so that we don't have to do the
+                 * mapping all the time
                 */
-                dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+                dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-                        sizeof(xfs_dqblk_t);
-                if (map.br_startblock == HOLESTARTBLOCK) {
-                        /*
-                         * We don't allocate unless we're asked to
-                         */
-                        if (!(flags & XFS_QMOPT_DQALLOC))
-                                return (ENOENT);
-                        ASSERT(tp);
-                        if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-                                                dqp->q_fileoffset, &bp)))
-                                return (error);
-                        tp = *tpp;
-                        newdquot = B_TRUE;
-                } else {
-                        /*
-                         * store the blkno etc so that we don't have to do the
-                         * mapping all the time
-                         */
-                        dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-                }
-        }
-        ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
-        ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
-        /*
-         * Read in the buffer, unless we've just done the allocation
-         * (in which case we already have the buf).
-         */
-        if (!newdquot) {
-                trace_xfs_dqtobp_read(dqp);
                error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                                           dqp->q_blkno,
@@ -552,13 +533,14 @@ xfs_qm_dqtobp(
                if (error || !bp)
                        return XFS_ERROR(error);
        }
        ASSERT(XFS_BUF_ISBUSY(bp));
        ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
        /*
         * calculate the location of the dquot inside the buffer.
         */
-        ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+        ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
        /*
         * A simple sanity check in case we got a corrupted dquot...
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush(
        xfs_dquot_t             *dqp,
        uint                    flags)
 {
-        xfs_mount_t             *mp;
+        struct xfs_mount        *mp = dqp->q_mount;
-        xfs_buf_t               *bp;
+        struct xfs_buf          *bp;
-        xfs_disk_dquot_t        *ddqp;
+        struct xfs_disk_dquot   *ddqp;
        int                     error;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        ASSERT(!completion_done(&dqp->q_flush));
        trace_xfs_dqflush(dqp);
        /*
-         * If not dirty, or it's pinned and we are not supposed to
+         * If not dirty, or it's pinned and we are not supposed to block, nada.
-         * block, nada.
         */
        if (!XFS_DQ_IS_DIRTY(dqp) ||
            (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1183,46 @@ xfs_qm_dqflush(
         * down forcibly. If that's the case we must not write this dquot
         * to disk, because the log record didn't make it to disk!
         */
-        if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
+        if (XFS_FORCED_SHUTDOWN(mp)) {
-                dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+                dqp->dq_flags &= ~XFS_DQ_DIRTY;
                xfs_dqfunlock(dqp);
                return XFS_ERROR(EIO);
        }
        /*
         * Get the buffer containing the on-disk dquot
-         * We don't need a transaction envelope because we know that the
-         * the ondisk-dquot has already been allocated for.
         */
-        if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
+        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+                                   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+        if (error) {
                ASSERT(error != ENOENT);
-                /*
-                 * Quotas could have gotten turned off (ESRCH)
-                 */
                xfs_dqfunlock(dqp);
-                return (error);
+                return error;
        }
-        if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
+        /*
-                           0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+         * Calculate the location of the dquot inside the buffer.
-                xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
+         */
+        ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+        /*
+         * A simple sanity check in case we got a corrupted dquot..
+         */
+        if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+                           XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+                xfs_buf_relse(bp);
+                xfs_dqfunlock(dqp);
+                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                return XFS_ERROR(EIO);
        }
        /* This is the only portion of data that needs to persist */
-        memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
+        memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
        /*
         * Clear the dirty field and remember the flush lsn for later use.
         */
-        dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+        dqp->dq_flags &= ~XFS_DQ_DIRTY;
-        mp = dqp->q_mount;
        xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
                                        &dqp->q_logitem.qli_item.li_lsn);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..f8e854b4fde8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,8 +55,6 @@ uint		ndquot;
 kmem_zone_t     *qm_dqzone;
 kmem_zone_t     *qm_dqtrxzone;
-static cred_t   xfs_zerocr;
 STATIC void     xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void     xfs_qm_list_destroy(xfs_dqlist_t *);
@@ -837,7 +835,7 @@ xfs_qm_dqattach_locked(
                        xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
                                                flags & XFS_QMOPT_DQALLOC,
                                                ip->i_udquot, &ip->i_gdquot) :
-                        xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+                        xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
                                                flags & XFS_QMOPT_DQALLOC,
                                                ip->i_udquot, &ip->i_gdquot);
                /*
@@ -1199,87 +1197,6 @@ xfs_qm_list_destroy(
        mutex_destroy(&(list->qh_lock));
 }
-/*
- * Stripped down version of dqattach. This doesn't attach, or even look at the
- * dquots attached to the inode. The rationale is that there won't be any
- * attached at the time this is called from quotacheck.
- */
-STATIC int
-xfs_qm_dqget_noattach(
-        xfs_inode_t     *ip,
-        xfs_dquot_t     **O_udqpp,
-        xfs_dquot_t     **O_gdqpp)
-{
-        int             error;
-        xfs_mount_t     *mp;
-        xfs_dquot_t     *udqp, *gdqp;
-        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        mp = ip->i_mount;
-        udqp = NULL;
-        gdqp = NULL;
-        if (XFS_IS_UQUOTA_ON(mp)) {
-                ASSERT(ip->i_udquot == NULL);
-                /*
-                 * We want the dquot allocated if it doesn't exist.
-                 */
-                if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
-                                         XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
-                                         &udqp))) {
-                        /*
-                         * Shouldn't be able to turn off quotas here.
-                         */
-                        ASSERT(error != ESRCH);
-                        ASSERT(error != ENOENT);
-                        return error;
-                }
-                ASSERT(udqp);
-        }
-        if (XFS_IS_OQUOTA_ON(mp)) {
-                ASSERT(ip->i_gdquot == NULL);
-                if (udqp)
-                        xfs_dqunlock(udqp);
-                error = XFS_IS_GQUOTA_ON(mp) ?
-                                xfs_qm_dqget(mp, ip,
-                                             ip->i_d.di_gid, XFS_DQ_GROUP,
-                                             XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-                                             &gdqp) :
-                                xfs_qm_dqget(mp, ip,
-                                             ip->i_d.di_projid, XFS_DQ_PROJ,
-                                             XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-                                             &gdqp);
-                if (error) {
-                        if (udqp)
-                                xfs_qm_dqrele(udqp);
-                        ASSERT(error != ESRCH);
-                        ASSERT(error != ENOENT);
-                        return error;
-                }
-                ASSERT(gdqp);
-                /* Reacquire the locks in the right order */
-                if (udqp) {
-                        if (! xfs_qm_dqlock_nowait(udqp)) {
-                                xfs_dqunlock(gdqp);
-                                xfs_dqlock(udqp);
-                                xfs_dqlock(gdqp);
-                        }
-                }
-        }
-        *O_udqpp = udqp;
-        *O_gdqpp = gdqp;
-#ifdef QUOTADEBUG
-        if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
-        if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
-#endif
-        return 0;
-}
 /*
 * Create an inode and return with a reference already taken, but unlocked
 * This is how we create quota inodes
@@ -1305,8 +1222,8 @@ xfs_qm_qino_alloc(
                return error;
        }
-        if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0,
+        error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
-                                   &xfs_zerocr, 0, 1, ip, &committed))) {
+        if (error) {
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
                                 XFS_TRANS_ABORT);
                return error;
@@ -1516,7 +1433,7 @@ xfs_qm_dqiterate(
                                rablkcnt =  map[i+1].br_blockcount;
                                rablkno = map[i+1].br_startblock;
                                while (rablkcnt--) {
-                                        xfs_baread(mp->m_ddev_targp,
+                                        xfs_buf_readahead(mp->m_ddev_targp,
                                               XFS_FSB_TO_DADDR(mp, rablkno),
                                               mp->m_quotainfo->qi_dqchunklen);
                                        rablkno++;
@@ -1546,18 +1463,34 @@ xfs_qm_dqiterate(
 /*
 * Called by dqusage_adjust in doing a quotacheck.
- * Given the inode, and a dquot (either USR or GRP, doesn't matter),
+ *
- * this updates its incore copy as well as the buffer copy. This is
+ * Given the inode, and a dquot id this updates both the incore dquot as well
- * so that once the quotacheck is done, we can just log all the buffers,
+ * as the buffer copy. This is so that once the quotacheck is done, we can
- * as opposed to logging numerous updates to individual dquots.
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
 */
-STATIC void
+STATIC int
 xfs_qm_quotacheck_dqadjust(
-        xfs_dquot_t             *dqp,
+        struct xfs_inode        *ip,
+        xfs_dqid_t              id,
+        uint                    type,
        xfs_qcnt_t              nblks,
        xfs_qcnt_t              rtblks)
 {
-        ASSERT(XFS_DQ_IS_LOCKED(dqp));
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_dquot        *dqp;
+        int                     error;
+        error = xfs_qm_dqget(mp, ip, id, type,
+                             XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+        if (error) {
+                /*
+                 * Shouldn't be able to turn off quotas here.
+                 */
+                ASSERT(error != ESRCH);
+                ASSERT(error != ENOENT);
+                return error;
+        }
        trace_xfs_dqadjust(dqp);
@@ -1582,11 +1515,13 @@ xfs_qm_quotacheck_dqadjust(
         * There are no timers for the default values set in the root dquot.
         */
        if (dqp->q_core.d_id) {
-                xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
+                xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
-                xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
+                xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
        }
        dqp->dq_flags |= XFS_DQ_DIRTY;
+        xfs_qm_dqput(dqp);
+        return 0;
 }
 STATIC int
@@ -1629,8 +1564,7 @@ xfs_qm_dqusage_adjust(
        int             *res)           /* result code value */
 {
        xfs_inode_t     *ip;
-        xfs_dquot_t     *udqp, *gdqp;
+        xfs_qcnt_t      nblks, rtblks = 0;
-        xfs_qcnt_t      nblks, rtblks;
        int             error;
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1584,24 @@ xfs_qm_dqusage_adjust(
         * the case in all other instances. It's OK that we do this because
         * quotacheck is done only at mount time.
         */
-        if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
+        error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+        if (error) {
                *res = BULKSTAT_RV_NOTHING;
                return error;
        }
-        /*
+        ASSERT(ip->i_delayed_blks == 0);
-         * Obtain the locked dquots. In case of an error (eg. allocation
-         * fails for ENOSPC), we return the negative of the error number
-         * to bulkstat, so that it can get propagated to quotacheck() and
-         * making us disable quotas for the file system.
-         */
-        if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                IRELE(ip);
-                *res = BULKSTAT_RV_GIVEUP;
-                return error;
-        }
-        rtblks = 0;
+        if (XFS_IS_REALTIME_INODE(ip)) {
-        if (! XFS_IS_REALTIME_INODE(ip)) {
-                nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
-        } else {
                /*
                 * Walk thru the extent list and count the realtime blocks.
                 */
-                if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
+                error = xfs_qm_get_rtblks(ip, &rtblks);
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                if (error)
-                        IRELE(ip);
+                        goto error0;
-                        if (udqp)
-                                xfs_qm_dqput(udqp);
-                        if (gdqp)
-                                xfs_qm_dqput(gdqp);
-                        *res = BULKSTAT_RV_GIVEUP;
-                        return error;
-                }
-                nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
        }
-        ASSERT(ip->i_delayed_blks == 0);
-        /*
+        nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
-         * We can't release the inode while holding its dquot locks.
-         * The inode can go into inactive and might try to acquire the dquotlocks.
-         * So, just unlock here and do a vn_rele at the end.
-         */
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        /*
         * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1616,36 @@ xfs_qm_dqusage_adjust(
         * and quotaoffs don't race. (Quotachecks happen at mount time only).
         */
        if (XFS_IS_UQUOTA_ON(mp)) {
-                ASSERT(udqp);
+                error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
-                xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
+                                                   XFS_DQ_USER, nblks, rtblks);
-                xfs_qm_dqput(udqp);
+                if (error)
+                        goto error0;
        }
-        if (XFS_IS_OQUOTA_ON(mp)) {
-                ASSERT(gdqp);
+        if (XFS_IS_GQUOTA_ON(mp)) {
-                xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
+                error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
-                xfs_qm_dqput(gdqp);
+                                                   XFS_DQ_GROUP, nblks, rtblks);
+                if (error)
+                        goto error0;
        }
-        /*
-         * Now release the inode. This will send it to 'inactive', and
-         * possibly even free blocks.
-         */
-        IRELE(ip);
-        /*
+        if (XFS_IS_PQUOTA_ON(mp)) {
-         * Goto next inode.
+                error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
-         */
+                                                   XFS_DQ_PROJ, nblks, rtblks);
+                if (error)
+                        goto error0;
+        }
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        IRELE(ip);
        *res = BULKSTAT_RV_DIDONE;
        return 0;
+error0:
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        IRELE(ip);
+        *res = BULKSTAT_RV_GIVEUP;
+        return error;
 }
 /*
@@ -2224,7 +2141,7 @@ xfs_qm_write_sb_changes(
 /*
- * Given an inode, a uid and gid (from cred_t) make sure that we have
+ * Given an inode, a uid, gid and prid make sure that we have
 * allocated relevant dquot(s) on disk, and that we won't exceed inode
 * quotas by creating this file.
 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2249,7 @@ xfs_qm_vop_dqalloc(
                        xfs_dqunlock(gq);
                }
        } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
-                if (ip->i_d.di_projid != prid) {
+                if (xfs_get_projid(ip) != prid) {
                        xfs_iunlock(ip, lockflags);
                        if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
                                                 XFS_DQ_PROJ,
@@ -2454,7 +2371,7 @@ xfs_qm_vop_chown_reserve(
        }
        if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
                if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-                     ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))
+                     xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
                        prjflags = XFS_QMOPT_ENOSPC;
                if (prjflags ||
@@ -2558,7 +2475,7 @@ xfs_qm_vop_create_dqattach(
                ip->i_gdquot = gdqp;
                ASSERT(XFS_IS_OQUOTA_ON(mp));
                ASSERT((XFS_IS_GQUOTA_ON(mp) ?
-                        ip->i_d.di_gid : ip->i_d.di_projid) ==
+                        ip->i_d.di_gid : xfs_get_projid(ip)) ==
                                be32_to_cpu(gdqp->q_core.d_id));
                xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
        }
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..45b5cb1788ab 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
        xfs_mount_t             *mp = ip->i_mount;
        xfs_dquot_t             *dqp;
-        if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
+        if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
                xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
                xfs_qm_dqput(dqp);
        }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..bdebc183223e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile(
                goto out_unlock;
        }
-        xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 out_unlock:
@@ -875,21 +875,14 @@ xfs_dqrele_inode(
        struct xfs_perag        *pag,
        int                     flags)
 {
-        int                     error;
        /* skip quota inodes */
        if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
            ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
                ASSERT(ip->i_udquot == NULL);
                ASSERT(ip->i_gdquot == NULL);
-                read_unlock(&pag->pag_ici_lock);
                return 0;
        }
-        error = xfs_sync_inode_valid(ip, pag);
-        if (error)
-                return error;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
                xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +893,6 @@ xfs_dqrele_inode(
                ip->i_gdquot = NULL;
        }
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        IRELE(ip);
        return 0;
 }
@@ -918,8 +909,7 @@ xfs_qm_dqrele_all_inodes(
        uint             flags)
 {
        ASSERT(mp->m_quotainfo);
-        xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
+        xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
-                                XFS_ICI_NO_TAG, 0, NULL);
 }
 /*------------------------------------------------------------------------*/
@@ -1175,7 +1165,7 @@ xfs_qm_internalqcheck_adjust(
        }
        xfs_qm_internalqcheck_get_dquots(mp,
                                        (xfs_dqid_t) ip->i_d.di_uid,
-                                        (xfs_dqid_t) ip->i_d.di_projid,
+                                        (xfs_dqid_t) xfs_get_projid(ip),
                                        (xfs_dqid_t) ip->i_d.di_gid,
                                        &ud, &gd);
        if (XFS_IS_UQUOTA_ON(mp)) {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..63c7a1a6c022 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,15 @@ typedef struct xfs_perag {
        rwlock_t        pag_ici_lock;   /* incore inode lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
        int             pag_ici_reclaimable;    /* reclaimable inodes */
+        struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
+        unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
+        /* buffer cache index */
+        spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
+        struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
+        /* for rcu-safe freeing */
+        struct rcu_head rcu_head;
 #endif
        int             pagb_count;     /* pagb slots in use */
 } xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..112abc439ca5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -675,7 +675,7 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   gtbnoa;         /* aligned ... */
        xfs_extlen_t    gtdiff;         /* difference to right side entry */
        xfs_extlen_t    gtlen;          /* length of right side entry */
-        xfs_extlen_t    gtlena;         /* aligned ... */
+        xfs_extlen_t    gtlena = 0;     /* aligned ... */
        xfs_agblock_t   gtnew;          /* useful start bno of right side */
        int             error;          /* error code */
        int             i;              /* result code, temporary */
@@ -684,7 +684,7 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   ltbnoa;         /* aligned ... */
        xfs_extlen_t    ltdiff;         /* difference to left side entry */
        xfs_extlen_t    ltlen;          /* length of left side entry */
-        xfs_extlen_t    ltlena;         /* aligned ... */
+        xfs_extlen_t    ltlena = 0;     /* aligned ... */
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
        xfs_extlen_t    rlen;           /* length of returned extent */
 #if defined(DEBUG) && defined(__KERNEL__)
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..3916925e2584 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -280,38 +280,6 @@ xfs_allocbt_key_diff(
        return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
-STATIC int
-xfs_allocbt_kill_root(
-        struct xfs_btree_cur    *cur,
-        struct xfs_buf          *bp,
-        int                     level,
-        union xfs_btree_ptr     *newroot)
-{
-        int                     error;
-        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-        XFS_BTREE_STATS_INC(cur, killroot);
-        /*
-         * Update the root pointer, decreasing the level by 1 and then
-         * free the old root.
-         */
-        xfs_allocbt_set_root(cur, newroot, -1);
-        error = xfs_allocbt_free_block(cur, bp);
-        if (error) {
-                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-                return error;
-        }
-        XFS_BTREE_STATS_INC(cur, free);
-        xfs_btree_setbuf(cur, level, NULL);
-        cur->bc_nlevels--;
-        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-        return 0;
-}
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
        .dup_cursor             = xfs_allocbt_dup_cursor,
        .set_root               = xfs_allocbt_set_root,
-        .kill_root              = xfs_allocbt_kill_root,
        .alloc_block            = xfs_allocbt_alloc_block,
        .free_block             = xfs_allocbt_free_block,
        .update_lastrec         = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..c86375378810 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
                        if (mp->m_flags & XFS_MOUNT_WSYNC) {
                                xfs_trans_set_sync(args.trans);
                        }
+                        if (!error && (flags & ATTR_KERNOTIME) == 0) {
+                                xfs_trans_ichgtime(args.trans, dp,
+                                                        XFS_ICHGTIME_CHG);
+                        }
                        err2 = xfs_trans_commit(args.trans,
                                                 XFS_TRANS_RELEASE_LOG_RES);
                        xfs_iunlock(dp, XFS_ILOCK_EXCL);
-                        /*
-                         * Hit the inode change time.
-                         */
-                        if (!error && (flags & ATTR_KERNOTIME) == 0) {
-                                xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-                        }
                        return(error == 0 ? err2 : error);
                }
@@ -420,6 +419,9 @@ xfs_attr_set_int(
                xfs_trans_set_sync(args.trans);
        }
+        if ((flags & ATTR_KERNOTIME) == 0)
+                xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
        /*
         * Commit the last in the sequence of transactions.
         */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
        error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
-        /*
-         * Hit the inode change time.
-         */
-        if (!error && (flags & ATTR_KERNOTIME) == 0) {
-                xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-        }
        return(error);
 out:
@@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
                xfs_trans_set_sync(args.trans);
        }
+        if ((flags & ATTR_KERNOTIME) == 0)
+                xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
        /*
         * Commit the last in the sequence of transactions.
         */
@@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
        error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
-        /*
-         * Hit the inode change time.
-         */
-        if (!error && (flags & ATTR_KERNOTIME) == 0) {
-                xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
-        }
        return(error);
 out:
@@ -1995,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
                        tmp = (valuelen < XFS_BUF_SIZE(bp))
                                ? valuelen : XFS_BUF_SIZE(bp);
-                        xfs_biomove(bp, 0, tmp, dst, XBF_READ);
+                        xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
                        xfs_buf_relse(bp);
                        dst += tmp;
                        valuelen -= tmp;
@@ -2125,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
                                                        XFS_BUF_SIZE(bp);
-                xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
+                xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
                if (tmp < XFS_BUF_SIZE(bp))
-                        xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+                        xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
                if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
                        return (error);
                }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..8abd12e32e13 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -614,7 +614,7 @@ xfs_bmap_add_extent(
                        nblks += cur->bc_private.b.allocated;
                ASSERT(nblks <= da_old);
                if (nblks < da_old)
-                        xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+                        xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
                                (int64_t)(da_old - nblks), rsvd);
        }
        /*
@@ -1079,7 +1079,8 @@ xfs_bmap_add_extent_delay_real(
                diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
                        (cur ? cur->bc_private.b.allocated : 0));
                if (diff > 0 &&
-                    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
+                    xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+                                             -((int64_t)diff), rsvd)) {
                        /*
                         * Ick gross gag me with a spoon.
                         */
@@ -1089,16 +1090,18 @@ xfs_bmap_add_extent_delay_real(
                                        temp--;
                                        diff--;
                                        if (!diff ||
-                                            !xfs_mod_incore_sb(ip->i_mount,
+                                            !xfs_icsb_modify_counters(ip->i_mount,
-                                                    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+                                                    XFS_SBS_FDBLOCKS,
+                                                    -((int64_t)diff), rsvd))
                                                break;
                                }
                                if (temp2) {
                                        temp2--;
                                        diff--;
                                        if (!diff ||
-                                            !xfs_mod_incore_sb(ip->i_mount,
+                                            !xfs_icsb_modify_counters(ip->i_mount,
-                                                    XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd))
+                                                    XFS_SBS_FDBLOCKS,
+                                                    -((int64_t)diff), rsvd))
                                                break;
                                }
                        }
@@ -1766,7 +1769,7 @@ xfs_bmap_add_extent_hole_delay(
        }
        if (oldlen != newlen) {
                ASSERT(oldlen > newlen);
-                xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+                xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
                        (int64_t)(oldlen - newlen), rsvd);
                /*
                 * Nothing to do for disk quota accounting here.
@@ -3111,9 +3114,10 @@ xfs_bmap_del_extent(
         * Nothing to do for disk quota accounting here.
         */
        ASSERT(da_old >= da_new);
-        if (da_old > da_new)
+        if (da_old > da_new) {
-                xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
+                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                        rsvd);
+                        (int64_t)(da_old - da_new), rsvd);
+        }
 done:
        *logflagsp = flags;
        return error;
@@ -4526,13 +4530,13 @@ xfs_bmapi(
                                                        -((int64_t)extsz), (flags &
                                                        XFS_BMAPI_RSVBLOCKS));
                                } else {
-                                        error = xfs_mod_incore_sb(mp,
+                                        error = xfs_icsb_modify_counters(mp,
                                                        XFS_SBS_FDBLOCKS,
                                                        -((int64_t)alen), (flags &
                                                        XFS_BMAPI_RSVBLOCKS));
                                }
                                if (!error) {
-                                        error = xfs_mod_incore_sb(mp,
+                                        error = xfs_icsb_modify_counters(mp,
                                                        XFS_SBS_FDBLOCKS,
                                                        -((int64_t)indlen), (flags &
                                                        XFS_BMAPI_RSVBLOCKS));
@@ -4542,7 +4546,7 @@ xfs_bmapi(
                                                        (int64_t)extsz, (flags &
                                                        XFS_BMAPI_RSVBLOCKS));
                                        else if (error)
-                                                xfs_mod_incore_sb(mp,
+                                                xfs_icsb_modify_counters(mp,
                                                        XFS_SBS_FDBLOCKS,
                                                        (int64_t)alen, (flags &
                                                        XFS_BMAPI_RSVBLOCKS));
@@ -4744,8 +4748,12 @@ xfs_bmapi(
                 * Check if writing previously allocated but
                 * unwritten extents.
                 */
-                if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
+                if (wr &&
-                    ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
+                    ((mval->br_state == XFS_EXT_UNWRITTEN &&
+                      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
+                     (mval->br_state == XFS_EXT_NORM &&
+                      ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
+                                (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
                        /*
                         * Modify (by adding) the state flag, if writing.
                         */
@@ -4757,7 +4765,9 @@ xfs_bmapi(
                                        *firstblock;
                                cur->bc_private.b.flist = flist;
                        }
-                        mval->br_state = XFS_EXT_NORM;
+                        mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+                                                ? XFS_EXT_NORM
+                                                : XFS_EXT_UNWRITTEN;
                        error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
                                firstblock, flist, &tmp_logflags,
                                whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
@@ -5200,7 +5210,7 @@ xfs_bunmapi(
                                        ip, -((long)del.br_blockcount), 0,
                                        XFS_QMOPT_RES_RTBLKS);
                        } else {
-                                xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
+                                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
                                                (int64_t)del.br_blockcount, rsvd);
                                (void)xfs_trans_reserve_quota_nblks(NULL,
                                        ip, -((long)del.br_blockcount), 0,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..71ec9b6ecdfc 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -74,9 +74,12 @@ typedef	struct xfs_bmap_free
 #define XFS_BMAPI_IGSTATE       0x080   /* Ignore state - */
                                        /* combine contig. space */
 #define XFS_BMAPI_CONTIG        0x100   /* must allocate only one extent */
-#define XFS_BMAPI_CONVERT       0x200   /* unwritten extent conversion - */
+/*
-                                        /* need write cache flushing and no */
+ * unwritten extent conversion - this needs write cache flushing and no additional
-                                        /* additional allocation alignments */
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise it converts from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT       0x200
 #define XFS_BMAPI_FLAGS \
        { XFS_BMAPI_WRITE,      "WRITE" }, \
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..04f9cca8da7e 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
         */
        for (i = 0; i < cur->bc_nlevels; i++) {
                if (cur->bc_bufs[i])
-                        xfs_btree_setbuf(cur, i, NULL);
+                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
                else if (!error)
                        break;
        }
@@ -656,7 +656,7 @@ xfs_btree_reada_bufl(
        ASSERT(fsbno != NULLFSBLOCK);
        d = XFS_FSB_TO_DADDR(mp, fsbno);
-        xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 /*
@@ -676,7 +676,7 @@ xfs_btree_reada_bufs(
        ASSERT(agno != NULLAGNUMBER);
        ASSERT(agbno != NULLAGBLOCK);
        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-        xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 STATIC int
@@ -763,22 +763,19 @@ xfs_btree_readahead(
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
-void
+STATIC void
 xfs_btree_setbuf(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     lev,    /* level in btree */
        xfs_buf_t               *bp)    /* new buffer to set */
 {
        struct xfs_btree_block  *b;     /* btree block */
-        xfs_buf_t               *obp;   /* old buffer pointer */
-        obp = cur->bc_bufs[lev];
+        if (cur->bc_bufs[lev])
-        if (obp)
+                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
-                xfs_trans_brelse(cur->bc_tp, obp);
        cur->bc_bufs[lev] = bp;
        cur->bc_ra[lev] = 0;
-        if (!bp)
-                return;
        b = XFS_BUF_TO_BLOCK(bp);
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
                if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -3011,6 +3008,43 @@ out0:
        return 0;
 }
+/*
+ * Kill the current root node, and replace it with its only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        int                     level,
+        union xfs_btree_ptr     *newroot)
+{
+        int                     error;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_STATS_INC(cur, killroot);
+        /*
+         * Update the root pointer, decreasing the level by 1 and then
+         * free the old root.
+         */
+        cur->bc_ops->set_root(cur, newroot, -1);
+        error = cur->bc_ops->free_block(cur, bp);
+        if (error) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                return error;
+        }
+        XFS_BTREE_STATS_INC(cur, free);
+        cur->bc_bufs[level] = NULL;
+        cur->bc_ra[level] = 0;
+        cur->bc_nlevels--;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+}
 STATIC int
 xfs_btree_dec_cursor(
        struct xfs_btree_cur    *cur,
@@ -3195,7 +3229,7 @@ xfs_btree_delrec(
                         * Make it the new root of the btree.
                         */
                        pp = xfs_btree_ptr_addr(cur, 1, block);
-                        error = cur->bc_ops->kill_root(cur, bp, level, pp);
+                        error = xfs_btree_kill_root(cur, bp, level, pp);
                        if (error)
                                goto error0;
                } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
        /* update btree root pointer */
        void    (*set_root)(struct xfs_btree_cur *cur,
-                                union xfs_btree_ptr *nptr, int level_change);
+                            union xfs_btree_ptr *nptr, int level_change);
-        int     (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
-                                int level, union xfs_btree_ptr *newroot);
        /* block allocation / freeing */
        int     (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
        xfs_agblock_t           agbno,  /* allocation group block number */
        xfs_extlen_t            count); /* count of filesystem blocks */
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-void
-xfs_btree_setbuf(
-        xfs_btree_cur_t         *cur,   /* btree cursor */
-        int                     lev,    /* level in btree */
-        struct xfs_buf          *bp);   /* new buffer to set */
 /*
 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..2686d0d54c5b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -692,8 +692,7 @@ xfs_buf_item_init(
         * the first.  If we do already have one, there is
         * nothing to do here so return.
         */
-        if (bp->b_mount != mp)
+        ASSERT(bp->b_target->bt_mount == mp);
-                bp->b_mount = mp;
        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                if (lip->li_type == XFS_LI_BUF) {
@@ -974,7 +973,7 @@ xfs_buf_iodone_callbacks(
                        xfs_buf_do_callbacks(bp, lip);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
-                        xfs_biodone(bp);
+                        xfs_buf_ioend(bp, 0);
                        return;
                }
@@ -1033,7 +1032,7 @@ xfs_buf_iodone_callbacks(
        xfs_buf_do_callbacks(bp, lip);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
-        xfs_biodone(bp);
+        xfs_buf_ioend(bp, 0);
 }
 /*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..1c00bedb3175 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2042,7 +2042,7 @@ xfs_da_do_buf(
                                mappedbno, nmapped, 0, &bp);
                        break;
                case 3:
-                        xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+                        xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
                        error = 0;
                        bp = NULL;
                        break;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
        __be32          di_uid;         /* owner's user id */
        __be32          di_gid;         /* owner's group id */
        __be32          di_nlink;       /* number of links to file */
-        __be16          di_projid;      /* owner's project id */
+        __be16          di_projid_lo;   /* lower part of owner's project id */
-        __u8            di_pad[8];      /* unused, zeroed space */
+        __be16          di_projid_hi;   /* higher part of owner's project id */
+        __u8            di_pad[6];      /* unused, zeroed space */
        __be16          di_flushiter;   /* incremented on flush */
        xfs_timestamp_t di_atime;       /* time last accessed */
        xfs_timestamp_t di_mtime;       /* time last modified */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
                                if (i > ra_current &&
                                    map[ra_index].br_blockcount >=
                                    mp->m_dirblkfsbs) {
-                                        xfs_baread(mp->m_ddev_targp,
+                                        xfs_buf_readahead(mp->m_ddev_targp,
                                                XFS_FSB_TO_DADDR(mp,
                                                   map[ra_index].br_startblock +
                                                   ra_offset),
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
         * If the file's parent directory is known, take its iolock in exclusive
         * mode to prevent two sibling files from racing each other to migrate
         * themselves and their parent to different AGs.
+         *
+         * Note that we lock the parent directory iolock inside the child
+         * iolock here.  That's fine as we never hold both parent and child
+         * iolock in any other place.  This is different from the ilock,
+         * which requires locking of the child after the parent for namespace
+         * operations.
         */
        if (pip)
-                xfs_ilock(pip, XFS_IOLOCK_EXCL);
+                xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        /*
         * A new AG needs to be found for the file.  If the file's parent
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
        __s32           bs_extsize;     /* extent size                  */
        __s32           bs_extents;     /* number of extents            */
        __u32           bs_gen;         /* generation count             */
-        __u16           bs_projid;      /* project id                   */
+        __u16           bs_projid_lo;   /* lower part of project id     */
+#define bs_projid       bs_projid_lo    /* (previously just bs_projid)  */
        __u16           bs_forkoff;     /* inode fork offset in bytes   */
-        unsigned char   bs_pad[12];     /* pad space, unused            */
+        __u16           bs_projid_hi;   /* higher part of project id    */
+        unsigned char   bs_pad[10];     /* pad space, unused            */
        __u32           bs_dmevmask;    /* DMIG event mask              */
        __u16           bs_dmstate;     /* DMIG state info              */
        __u16           bs_aextents;    /* attribute number of extents  */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
 /*      XFS_IOC_SETBIOSIZE ---- deprecated 46      */
 /*      XFS_IOC_GETBIOSIZE ---- deprecated 47      */
 #define XFS_IOC_GETBMAPX        _IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE      _IOW ('X', 57, struct xfs_flock64)
 /*
 * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..a7c116e814af 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -144,12 +144,11 @@ xfs_growfs_data_private(
        if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
                return error;
        dpct = pct - mp->m_sb.sb_imax_pct;
-        error = xfs_read_buf(mp, mp->m_ddev_targp,
+        bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
-                        XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+                                XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-                        XFS_FSS_TO_BB(mp, 1), 0, &bp);
+                                BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
-        if (error)
+        if (!bp)
-                return error;
+                return EIO;
-        ASSERT(bp);
        xfs_buf_relse(bp);
        new = nb;       /* use new as a temporary here */
@@ -597,7 +596,8 @@ out:
                 * the extra reserve blocks from the reserve.....
                 */
                int error;
-                error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0);
+                error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                                 fdblks_delta, 0);
                if (error == ENOSPC)
                        goto retry;
        }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..0626a32c3447 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
                 *      to log a whole cluster of inodes instead of all the
                 *      individual transactions causing a lot of log traffic.
                 */
-                xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+                xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
                for (i = 0; i < ninodes; i++) {
                        int     ioffset = i << mp->m_sb.sb_inodelog;
                        uint    isize = sizeof(struct xfs_dinode);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
                          cur->bc_rec.i.ir_startino;
 }
-STATIC int
-xfs_inobt_kill_root(
-        struct xfs_btree_cur    *cur,
-        struct xfs_buf          *bp,
-        int                     level,
-        union xfs_btree_ptr     *newroot)
-{
-        int                     error;
-        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
-        XFS_BTREE_STATS_INC(cur, killroot);
-        /*
-         * Update the root pointer, decreasing the level by 1 and then
-         * free the old root.
-         */
-        xfs_inobt_set_root(cur, newroot, -1);
-        error = xfs_inobt_free_block(cur, bp);
-        if (error) {
-                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
-                return error;
-        }
-        XFS_BTREE_STATS_INC(cur, free);
-        cur->bc_bufs[level] = NULL;
-        cur->bc_nlevels--;
-        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
-        return 0;
-}
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
        .dup_cursor             = xfs_inobt_dup_cursor,
        .set_root               = xfs_inobt_set_root,
-        .kill_root              = xfs_inobt_kill_root,
        .alloc_block            = xfs_inobt_alloc_block,
        .free_block             = xfs_inobt_free_block,
        .get_minrecs            = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..0cdd26932d8e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -365,8 +365,8 @@ xfs_iget(
        xfs_perag_t     *pag;
        xfs_agino_t     agino;
-        /* the radix tree exists only in inode capable AGs */
+        /* reject inode numbers outside existing AGs */
-        if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+        if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
        /* get the perag structure and ensure that it's inode capable */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..108c7a085f94 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -660,7 +660,8 @@ xfs_dinode_from_disk(
        to->di_uid = be32_to_cpu(from->di_uid);
        to->di_gid = be32_to_cpu(from->di_gid);
        to->di_nlink = be32_to_cpu(from->di_nlink);
-        to->di_projid = be16_to_cpu(from->di_projid);
+        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = be16_to_cpu(from->di_flushiter);
        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +696,8 @@ xfs_dinode_to_disk(
        to->di_uid = cpu_to_be32(from->di_uid);
        to->di_gid = cpu_to_be32(from->di_gid);
        to->di_nlink = cpu_to_be32(from->di_nlink);
-        to->di_projid = cpu_to_be16(from->di_projid);
+        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = cpu_to_be16(from->di_flushiter);
        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -874,7 +876,7 @@ xfs_iread(
        if (ip->i_d.di_version == 1) {
                ip->i_d.di_nlink = ip->i_d.di_onlink;
                ip->i_d.di_onlink = 0;
-                ip->i_d.di_projid = 0;
+                xfs_set_projid(ip, 0);
        }
        ip->i_delayed_blks = 0;
@@ -982,8 +984,7 @@ xfs_ialloc(
        mode_t          mode,
        xfs_nlink_t     nlink,
        xfs_dev_t       rdev,
-        cred_t          *cr,
+        prid_t          prid,
-        xfs_prid_t      prid,
        int             okalloc,
        xfs_buf_t       **ialloc_context,
        boolean_t       *call_again,
@@ -1027,7 +1028,7 @@ xfs_ialloc(
        ASSERT(ip->i_d.di_nlink == nlink);
        ip->i_d.di_uid = current_fsuid();
        ip->i_d.di_gid = current_fsgid();
-        ip->i_d.di_projid = prid;
+        xfs_set_projid(ip, prid);
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
        /*
@@ -2725,7 +2726,7 @@ cluster_corrupt_out:
                        XFS_BUF_UNDONE(bp);
                        XFS_BUF_STALE(bp);
                        XFS_BUF_ERROR(bp,EIO);
-                        xfs_biodone(bp);
+                        xfs_buf_ioend(bp, 0);
                } else {
                        XFS_BUF_STALE(bp);
                        xfs_buf_relse(bp);
@@ -3008,7 +3009,7 @@ xfs_iflush_int(
                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
                        memset(&(dip->di_pad[0]), 0,
                              sizeof(dip->di_pad));
-                        ASSERT(ip->i_d.di_projid == 0);
+                        ASSERT(xfs_get_projid(ip) == 0);
                }
        }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..fb2ca2e4cdc9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -134,8 +134,9 @@ typedef struct xfs_icdinode {
        __uint32_t      di_uid;         /* owner's user id */
        __uint32_t      di_gid;         /* owner's group id */
        __uint32_t      di_nlink;       /* number of links to file */
-        __uint16_t      di_projid;      /* owner's project id */
+        __uint16_t      di_projid_lo;   /* lower part of owner's project id */
-        __uint8_t       di_pad[8];      /* unused, zeroed space */
+        __uint16_t      di_projid_hi;   /* higher part of owner's project id */
+        __uint8_t       di_pad[6];      /* unused, zeroed space */
        __uint16_t      di_flushiter;   /* incremented on flush */
        xfs_ictimestamp_t di_atime;     /* time last accessed */
        xfs_ictimestamp_t di_mtime;     /* time last modified */
@@ -212,7 +213,6 @@ typedef struct xfs_icdinode {
 #ifdef __KERNEL__
 struct bhv_desc;
-struct cred;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -335,6 +335,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 }
 /*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+        return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+                prid_t projid)
+{
+        ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+        ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+/*
 * Manage the i_flush queue embedded in the inode.  This completion
 * queue synchronizes processes attempting to flush the in-core
 * inode back to disk.
@@ -456,8 +475,8 @@ void		xfs_inode_free(struct xfs_inode *ip);
 * xfs_inode.c prototypes.
 */
 int             xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-                           xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
+                           xfs_nlink_t, xfs_dev_t, prid_t, int,
-                           int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
+                           struct xfs_buf **, boolean_t *, xfs_inode_t **);
 uint            xfs_ip2xflags(struct xfs_inode *);
 uint            xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +490,6 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void            xfs_iext_realloc(xfs_inode_t *, int, int);
 void            xfs_iunpin_wait(xfs_inode_t *);
 int             xfs_iflush(xfs_inode_t *, uint);
-void            xfs_ichgtime(xfs_inode_t *, int);
 void            xfs_lock_inodes(xfs_inode_t **, int, uint);
 void            xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
@@ -482,7 +500,7 @@ void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-        atomic_inc(&(VFS_I(ip)->i_count)); \
+        ihold(VFS_I(ip)); \
        trace_xfs_ihold(ip, _THIS_IP_); \
 } while (0)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..c7ac020705df 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -223,15 +223,6 @@ xfs_inode_item_format(
        nvecs        = 1;
        /*
-         * Make sure the linux inode is dirty. We do this before
-         * clearing i_update_core as the VFS will call back into
-         * XFS here and set i_update_core, so we need to dirty the
-         * inode first so that the ordering of i_update_core and
-         * unlogged modifications still works as described below.
-         */
-        xfs_mark_inode_dirty_sync(ip);
-        /*
         * Clear i_update_core if the timestamps (or any other
         * non-transactional modification) need flushing/logging
         * and we're about to log them with the rest of the core.
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..dc1882adaf54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
         * further change.
         */
        buf->bs_nlink = dic->di_nlink;
-        buf->bs_projid = dic->di_projid;
+        buf->bs_projid_lo = dic->di_projid_lo;
+        buf->bs_projid_hi = dic->di_projid_hi;
        buf->bs_ino = ino;
        buf->bs_mode = dic->di_mode;
        buf->bs_uid = dic->di_uid;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..cee4ab9f8a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -917,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
        l = iclog->ic_log;
        /*
-         * If the _XFS_BARRIER_FAILED flag was set by a lower
-         * layer, it means the underlying device no longer supports
-         * barrier I/O. Warn loudly and turn off barriers.
-         */
-        if (bp->b_flags & _XFS_BARRIER_FAILED) {
-                bp->b_flags &= ~_XFS_BARRIER_FAILED;
-                l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
-                xfs_fs_cmn_err(CE_WARN, l->l_mp,
-                                "xlog_iodone: Barriers are no longer supported"
-                                " by device. Disabling barriers\n");
-        }
-        /*
         * Race to shutdown the filesystem if we see an error.
         */
        if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1131,7 +1118,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
                iclog->ic_prev = prev_iclog;
                prev_iclog = iclog;
-                bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+                bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+                                                log->l_iclog_size, 0);
                if (!bp)
                        goto out_free_iclog;
                if (!XFS_BUF_CPSEMA(bp))
@@ -1309,7 +1297,7 @@ xlog_bdstrat(
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                XFS_BUF_ERROR(bp, EIO);
                XFS_BUF_STALE(bp);
-                xfs_biodone(bp);
+                xfs_buf_ioend(bp, 0);
                /*
                 * It would seem logical to return EIO here, but we rely on
                 * the log state machine to propagate I/O errors instead of
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..23d6ceb5e97b 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -146,102 +146,6 @@ xlog_cil_init_post_recovery(
 }
 /*
- * Insert the log item into the CIL and calculate the difference in space
- * consumed by the item. Add the space to the checkpoint ticket and calculate
- * if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
- * the current transaction ticket so that the accounting works out correctly.
- *
- * If this is the first time the item is being placed into the CIL in this
- * context, pin it so it can't be written to disk until the CIL is flushed to
- * the iclog and the iclog written to disk.
- */
-static void
-xlog_cil_insert(
-        struct log              *log,
-        struct xlog_ticket      *ticket,
-        struct xfs_log_item     *item,
-        struct xfs_log_vec      *lv)
-{
-        struct xfs_cil          *cil = log->l_cilp;
-        struct xfs_log_vec      *old = lv->lv_item->li_lv;
-        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
-        int                     len;
-        int                     diff_iovecs;
-        int                     iclog_space;
-        if (old) {
-                /* existing lv on log item, space used is a delta */
-                ASSERT(!list_empty(&item->li_cil));
-                ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-                len = lv->lv_buf_len - old->lv_buf_len;
-                diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
-                kmem_free(old->lv_buf);
-                kmem_free(old);
-        } else {
-                /* new lv, must pin the log item */
-                ASSERT(!lv->lv_item->li_lv);
-                ASSERT(list_empty(&item->li_cil));
-                len = lv->lv_buf_len;
-                diff_iovecs = lv->lv_niovecs;
-                IOP_PIN(lv->lv_item);
-        }
-        len += diff_iovecs * sizeof(xlog_op_header_t);
-        /* attach new log vector to log item */
-        lv->lv_item->li_lv = lv;
-        spin_lock(&cil->xc_cil_lock);
-        list_move_tail(&item->li_cil, &cil->xc_cil);
-        ctx->nvecs += diff_iovecs;
-        /*
-         * If this is the first time the item is being committed to the CIL,
-         * store the sequence number on the log item so we can tell
-         * in future commits whether this is the first checkpoint the item is
-         * being committed into.
-         */
-        if (!item->li_seq)
-                item->li_seq = ctx->sequence;
-        /*
-         * Now transfer enough transaction reservation to the context ticket
-         * for the checkpoint. The context ticket is special - the unit
-         * reservation has to grow as well as the current reservation as we
-         * steal from tickets so we can correctly determine the space used
-         * during the transaction commit.
-         */
-        if (ctx->ticket->t_curr_res == 0) {
-                /* first commit in checkpoint, steal the header reservation */
-                ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
-                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-                ticket->t_curr_res -= ctx->ticket->t_unit_res;
-        }
-        /* do we need space for more log record headers? */
-        iclog_space = log->l_iclog_size - log->l_iclog_hsize;
-        if (len > 0 && (ctx->space_used / iclog_space !=
-                                (ctx->space_used + len) / iclog_space)) {
-                int hdrs;
-                hdrs = (len + iclog_space - 1) / iclog_space;
-                /* need to take into account split region headers, too */
-                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
-                ctx->ticket->t_unit_res += hdrs;
-                ctx->ticket->t_curr_res += hdrs;
-                ticket->t_curr_res -= hdrs;
-                ASSERT(ticket->t_curr_res >= len);
-        }
-        ticket->t_curr_res -= len;
-        ctx->space_used += len;
-        spin_unlock(&cil->xc_cil_lock);
-}
-/*
 * Format log item into a flat buffers
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +190,7 @@ xlog_cil_format_items(
                        len += lv->lv_iovecp[index].i_len;
                lv->lv_buf_len = len;
-                lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+                lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
                ptr = lv->lv_buf;
                for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +204,136 @@ xlog_cil_format_items(
        }
 }
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ *
+ * log supplies the current CIL context, whose sequence number is stamped on
+ * the item if the item does not have one yet (li_seq == 0).
+ *
+ * lv is the new log vector. It is attached to lv->lv_item; a vector that was
+ * already attached there is freed together with its buffer.
+ *
+ * len and diff_iovecs are running totals owned by the caller. This function
+ * only adds to them, so they must be initialised before the first call. For
+ * an item that already had a vector the amounts added are deltas against the
+ * old vector and may be negative; for a new item they are the full buffer
+ * length and iovec count.
+ */
+STATIC void
+xfs_cil_prepare_item(
+        struct log              *log,
+        struct xfs_log_vec      *lv,
+        int                     *len,
+        int                     *diff_iovecs)
+{
+        struct xfs_log_vec      *old = lv->lv_item->li_lv;
+        if (old) {
+                /* existing lv on log item, space used is a delta */
+                ASSERT(!list_empty(&lv->lv_item->li_cil));
+                ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+                *len += lv->lv_buf_len - old->lv_buf_len;
+                *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+                kmem_free(old->lv_buf);
+                kmem_free(old);
+        } else {
+                /* new lv, must pin the log item */
+                ASSERT(!lv->lv_item->li_lv);
+                ASSERT(list_empty(&lv->lv_item->li_cil));
+                *len += lv->lv_buf_len;
+                *diff_iovecs += lv->lv_niovecs;
+                IOP_PIN(lv->lv_item);
+        }
+        /* attach new log vector to log item */
+        lv->lv_item->li_lv = lv;
+        /*
+         * If this is the first time the item is being committed to the
+         * CIL, store the sequence number on the log item so we can
+         * tell in future commits whether this is the first checkpoint
+         * the item is being committed into.
+         */
+        if (!lv->lv_item->li_seq)
+                lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
 static void
 xlog_cil_insert_items(
        struct log              *log,
        struct xfs_log_vec      *log_vector,
-        struct xlog_ticket      *ticket,
+        struct xlog_ticket      *ticket)
-        xfs_lsn_t               *start_lsn)
 {
-        struct xfs_log_vec *lv;
+        struct xfs_cil          *cil = log->l_cilp;
+        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
-        if (start_lsn)
+        struct xfs_log_vec      *lv;
-                *start_lsn = log->l_cilp->xc_ctx->sequence;
+        int                     len = 0;
+        int                     diff_iovecs = 0;
+        int                     iclog_space;
        ASSERT(log_vector);
+        /*
+         * Do all the accounting aggregation and switching of log vectors
+         * around in a separate loop to the insertion of items into the CIL.
+         * Then we can do a separate loop to update the CIL within a single
+         * lock/unlock pair. This reduces the number of round trips on the CIL
+         * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+         * hold time for the transaction commit.
+         *
+         * If this is the first time the item is being placed into the CIL in
+         * this context, pin it so it can't be written to disk until the CIL is
+         * flushed to the iclog and the iclog written to disk.
+         *
+         * We can do this safely because the context can't checkpoint until we
+         * are done so it doesn't matter exactly how we update the CIL.
+         */
+        for (lv = log_vector; lv; lv = lv->lv_next)
+                xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+        /* account for space used by new iovec headers  */
+        len += diff_iovecs * sizeof(xlog_op_header_t);
+        spin_lock(&cil->xc_cil_lock);
+        /* move the items to the tail of the CIL */
        for (lv = log_vector; lv; lv = lv->lv_next)
-                xlog_cil_insert(log, ticket, lv->lv_item, lv);
+                list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+        ctx->nvecs += diff_iovecs;
+        /*
+         * Now transfer enough transaction reservation to the context ticket
+         * for the checkpoint. The context ticket is special - the unit
+         * reservation has to grow as well as the current reservation as we
+         * steal from tickets so we can correctly determine the space used
+         * during the transaction commit.
+         */
+        if (ctx->ticket->t_curr_res == 0) {
+                /* first commit in checkpoint, steal the header reservation */
+                ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+                ticket->t_curr_res -= ctx->ticket->t_unit_res;
+        }
+        /* do we need space for more log record headers? */
+        iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+        if (len > 0 && (ctx->space_used / iclog_space !=
+                                (ctx->space_used + len) / iclog_space)) {
+                int hdrs;
+                hdrs = (len + iclog_space - 1) / iclog_space;
+                /* need to take into account split region headers, too */
+                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+                ctx->ticket->t_unit_res += hdrs;
+                ctx->ticket->t_curr_res += hdrs;
+                ticket->t_curr_res -= hdrs;
+                ASSERT(ticket->t_curr_res >= len);
+        }
+        ticket->t_curr_res -= len;
+        ctx->space_used += len;
+        spin_unlock(&cil->xc_cil_lock);
 }
 static void
@@ -638,7 +657,10 @@ xfs_log_commit_cil(
        /* lock out background commit */
        down_read(&log->l_cilp->xc_ctx_lock);
-        xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+        if (commit_lsn)
+                *commit_lsn = log->l_cilp->xc_ctx->sequence;
+        xlog_cil_insert_items(log, log_vector, tp->t_ticket);
        /* check we didn't blow the reservation */
        if (tp->t_ticket->t_curr_res < 0)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..966d3f97458c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -107,7 +107,8 @@ xlog_get_bp(
                nbblks += log->l_sectBBsize;
        nbblks = round_up(nbblks, log->l_sectBBsize);
-        return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
+        return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
+                                        BBTOB(nbblks), 0);
 }
 STATIC void
@@ -167,7 +168,7 @@ xlog_bread_noalign(
        XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
        xfsbdstrat(log->l_mp, bp);
-        error = xfs_iowait(bp);
+        error = xfs_buf_iowait(bp);
        if (error)
                xfs_ioerror_alert("xlog_bread", log->l_mp,
                                  bp, XFS_BUF_ADDR(bp));
@@ -321,12 +322,13 @@ xlog_recover_iodone(
                 * this during recovery. One strike!
                 */
                xfs_ioerror_alert("xlog_recover_iodone",
-                                  bp->b_mount, bp, XFS_BUF_ADDR(bp));
+                                        bp->b_target->bt_mount, bp,
-                xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
+                                        XFS_BUF_ADDR(bp));
+                xfs_force_shutdown(bp->b_target->bt_mount,
+                                        SHUTDOWN_META_IO_ERROR);
        }
-        bp->b_mount = NULL;
        XFS_BUF_CLR_IODONE_FUNC(bp);
-        xfs_biodone(bp);
+        xfs_buf_ioend(bp, 0);
 }
 /*
@@ -2275,8 +2277,7 @@ xlog_recover_do_buffer_trans(
                XFS_BUF_STALE(bp);
                error = xfs_bwrite(mp, bp);
        } else {
-                ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+                ASSERT(bp->b_target->bt_mount == mp);
-                bp->b_mount = mp;
                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
                xfs_bdwrite(mp, bp);
        }
@@ -2540,8 +2541,7 @@ xlog_recover_do_inode_trans(
        }
 write_inode_buffer:
-        ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+        ASSERT(bp->b_target->bt_mount == mp);
-        bp->b_mount = mp;
        XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
        xfs_bdwrite(mp, bp);
 error:
@@ -2678,8 +2678,7 @@ xlog_recover_do_dquot_trans(
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
        ASSERT(dq_f->qlf_size == 2);
-        ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+        ASSERT(bp->b_target->bt_mount == mp);
-        bp->b_mount = mp;
        XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
        xfs_bdwrite(mp, bp);
@@ -3817,7 +3816,7 @@ xlog_do_recover(
        XFS_BUF_READ(bp);
        XFS_BUF_UNASYNC(bp);
        xfsbdstrat(log->l_mp, bp);
-        error = xfs_iowait(bp);
+        error = xfs_buf_iowait(bp);
        if (error) {
                xfs_ioerror_alert("xlog_do_recover",
                                  log->l_mp, bp, XFS_BUF_ADDR(bp));
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..19e9dfa1c254 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
                                                int);
 STATIC void     xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
                                                int);
-STATIC int      xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
-                                                int64_t, int);
 STATIC void     xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 #else
 #define xfs_icsb_balance_counter(mp, a, b)              do { } while (0)
 #define xfs_icsb_balance_counter_locked(mp, a, b)       do { } while (0)
-#define xfs_icsb_modify_counters(mp, a, b, c)           do { } while (0)
 #endif
 static const struct {
@@ -199,6 +194,8 @@ xfs_uuid_unmount(
 /*
 * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
 */
 struct xfs_perag *
 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +203,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
        struct xfs_perag        *pag;
        int                     ref = 0;
-        spin_lock(&mp->m_perag_lock);
+        rcu_read_lock();
        pag = radix_tree_lookup(&mp->m_perag_tree, agno);
        if (pag) {
                ASSERT(atomic_read(&pag->pag_ref) >= 0);
-                /* catch leaks in the positive direction during testing */
-                ASSERT(atomic_read(&pag->pag_ref) < 1000);
                ref = atomic_inc_return(&pag->pag_ref);
        }
-        spin_unlock(&mp->m_perag_lock);
+        rcu_read_unlock();
        trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
        return pag;
 }
+/*
+ * Search from @first to find the next perag with the given tag set.
+ *
+ * Performs a single-entry radix_tree_gang_lookup_tag() on mp->m_perag_tree
+ * while holding rcu_read_lock(). Returns NULL if the lookup finds nothing.
+ * Otherwise pag_ref is incremented before the RCU read lock is dropped, the
+ * lookup is traced, and the perag is returned.
+ * NOTE(review): presumably the caller drops the reference with
+ * xfs_perag_put() - confirm against the callers.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          first,
+        int                     tag)
+{
+        struct xfs_perag        *pag;
+        int                     found;
+        int                     ref;
+        rcu_read_lock();
+        found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                        (void **)&pag, first, 1, tag);
+        if (found <= 0) {
+                rcu_read_unlock();
+                return NULL;
+        }
+        ref = atomic_inc_return(&pag->pag_ref);
+        rcu_read_unlock();
+        trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+        return pag;
+}
 void
 xfs_perag_put(struct xfs_perag *pag)
 {
@@ -229,10 +250,18 @@ xfs_perag_put(struct xfs_perag *pag)
        trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
 }
+STATIC void  /* handed to call_rcu() by xfs_free_perag() */
+__xfs_free_perag(
+        struct rcu_head *head)
+{  /* head is the rcu_head member embedded in the xfs_perag being freed */
+        struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+        ASSERT(atomic_read(&pag->pag_ref) == 0);  /* no references may remain */
+        kmem_free(pag);
+}
 /*
- * Free up the resources associated with a mount structure.  Assume that
+ * Free up the per-ag resources associated with the mount structure.
- * the structure was initially zeroed, so we can tell which fields got
- * initialized.
 */
 STATIC void
 xfs_free_perag(
@@ -244,10 +273,10 @@ xfs_free_perag(
        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
                spin_lock(&mp->m_perag_lock);
                pag = radix_tree_delete(&mp->m_perag_tree, agno);
+                spin_unlock(&mp->m_perag_lock);
                ASSERT(pag);
                ASSERT(atomic_read(&pag->pag_ref) == 0);
-                spin_unlock(&mp->m_perag_lock);
+                call_rcu(&pag->rcu_head, __xfs_free_perag);
-                kmem_free(pag);
        }
 }
@@ -444,7 +473,10 @@ xfs_initialize_perag(
                pag->pag_agno = index;
                pag->pag_mount = mp;
                rwlock_init(&pag->pag_ici_lock);
+                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+                spin_lock_init(&pag->pag_buf_lock);
+                pag->pag_buf_tree = RB_ROOT;
                if (radix_tree_preload(GFP_NOFS))
                        goto out_unwind;
@@ -639,7 +671,6 @@ int
 xfs_readsb(xfs_mount_t *mp, int flags)
 {
        unsigned int    sector_size;
-        unsigned int    extra_flags;
        xfs_buf_t       *bp;
        int             error;
@@ -652,28 +683,24 @@ xfs_readsb(xfs_mount_t *mp, int flags)
         * access to the superblock.
         */
        sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-        extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
-        bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
+reread:
-                          extra_flags);
+        bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
-        if (!bp || XFS_BUF_ISERROR(bp)) {
+                                        XFS_SB_DADDR, sector_size, 0);
-                xfs_fs_mount_cmn_err(flags, "SB read failed");
+        if (!bp) {
-                error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
+                xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
-                goto fail;
+                return EIO;
        }
-        ASSERT(XFS_BUF_ISBUSY(bp));
-        ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
        /*
         * Initialize the mount structure from the superblock.
         * But first do some basic consistency checking.
         */
        xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
        error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
        if (error) {
                xfs_fs_mount_cmn_err(flags, "SB validate failed");
-                goto fail;
+                goto release_buf;
        }
        /*
@@ -684,7 +711,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
                        "device supports only %u byte sectors (not %u)",
                        sector_size, mp->m_sb.sb_sectsize);
                error = ENOSYS;
-                goto fail;
+                goto release_buf;
        }
        /*
@@ -692,33 +719,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
         * re-read the superblock so the buffer is correctly sized.
         */
        if (sector_size < mp->m_sb.sb_sectsize) {
-                XFS_BUF_UNMANAGE(bp);
                xfs_buf_relse(bp);
                sector_size = mp->m_sb.sb_sectsize;
-                bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR,
+                goto reread;
-                                  BTOBB(sector_size), extra_flags);
-                if (!bp || XFS_BUF_ISERROR(bp)) {
-                        xfs_fs_mount_cmn_err(flags, "SB re-read failed");
-                        error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
-                        goto fail;
-                }
-                ASSERT(XFS_BUF_ISBUSY(bp));
-                ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
        }
        /* Initialize per-cpu counters */
        xfs_icsb_reinit_counters(mp);
        mp->m_sb_bp = bp;
-        xfs_buf_relse(bp);
+        xfs_buf_unlock(bp);
-        ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
        return 0;
- fail:
+release_buf:
-        if (bp) {
+        xfs_buf_relse(bp);
-                XFS_BUF_UNMANAGE(bp);
-                xfs_buf_relse(bp);
-        }
        return error;
 }
@@ -991,42 +1005,35 @@ xfs_check_sizes(xfs_mount_t *mp)
 {
        xfs_buf_t       *bp;
        xfs_daddr_t     d;
-        int             error;
        d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
        if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
-                cmn_err(CE_WARN, "XFS: size check 1 failed");
+                cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
                return XFS_ERROR(EFBIG);
        }
-        error = xfs_read_buf(mp, mp->m_ddev_targp,
+        bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
-                             d - XFS_FSS_TO_BB(mp, 1),
+                                        d - XFS_FSS_TO_BB(mp, 1),
-                             XFS_FSS_TO_BB(mp, 1), 0, &bp);
+                                        BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
-        if (!error) {
+        if (!bp) {
-                xfs_buf_relse(bp);
+                cmn_err(CE_WARN, "XFS: last sector read failed");
-        } else {
+                return EIO;
-                cmn_err(CE_WARN, "XFS: size check 2 failed");
-                if (error == ENOSPC)
-                        error = XFS_ERROR(EFBIG);
-                return error;
        }
+        xfs_buf_relse(bp);
        if (mp->m_logdev_targp != mp->m_ddev_targp) {
                d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
                if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-                        cmn_err(CE_WARN, "XFS: size check 3 failed");
+                        cmn_err(CE_WARN, "XFS: log size mismatch detected");
                        return XFS_ERROR(EFBIG);
                }
-                error = xfs_read_buf(mp, mp->m_logdev_targp,
+                bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
-                                     d - XFS_FSB_TO_BB(mp, 1),
+                                        d - XFS_FSB_TO_BB(mp, 1),
-                                     XFS_FSB_TO_BB(mp, 1), 0, &bp);
+                                        XFS_FSB_TO_B(mp, 1), 0);
-                if (!error) {
+                if (!bp) {
-                        xfs_buf_relse(bp);
+                        cmn_err(CE_WARN, "XFS: log device read failed");
-                } else {
+                        return EIO;
-                        cmn_err(CE_WARN, "XFS: size check 3 failed");
-                        if (error == ENOSPC)
-                                error = XFS_ERROR(EFBIG);
-                        return error;
                }
+                xfs_buf_relse(bp);
        }
        return 0;
 }
@@ -1601,7 +1608,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
                XFS_BUF_UNASYNC(sbp);
                ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
                xfsbdstrat(mp, sbp);
-                error = xfs_iowait(sbp);
+                error = xfs_buf_iowait(sbp);
                if (error)
                        xfs_ioerror_alert("xfs_unmountfs_writesb",
                                          mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1839,72 @@ xfs_mod_incore_sb_unlocked(
 */
 int
 xfs_mod_incore_sb(
-        xfs_mount_t     *mp,
+        struct xfs_mount        *mp,
-        xfs_sb_field_t  field,
+        xfs_sb_field_t          field,
-        int64_t         delta,
+        int64_t                 delta,
-        int             rsvd)
+        int                     rsvd)
 {
-        int     status;
+        int                     status;
-        /* check for per-cpu counters */
-        switch (field) {
 #ifdef HAVE_PERCPU_SB
-        case XFS_SBS_ICOUNT:
+        ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
-        case XFS_SBS_IFREE:
-        case XFS_SBS_FDBLOCKS:
-                if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-                        status = xfs_icsb_modify_counters(mp, field,
-                                                        delta, rsvd);
-                        break;
-                }
-                /* FALLTHROUGH */
 #endif
-        default:
+        spin_lock(&mp->m_sb_lock);
-                spin_lock(&mp->m_sb_lock);
+        status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-                status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+        spin_unlock(&mp->m_sb_lock);
-                spin_unlock(&mp->m_sb_lock);
-                break;
-        }
        return status;
 }
 /*
- * xfs_mod_incore_sb_batch() is used to change more than one field
+ * Change more than one field in the in-core superblock structure at a time.
- * in the in-core superblock structure at a time.  This modification
- * is protected by a lock internal to this module.  The fields and
- * changes to those fields are specified in the array of xfs_mod_sb
- * structures passed in.
 *
- * Either all of the specified deltas will be applied or none of
+ * The fields and changes to those fields are specified in the array of
- * them will.  If any modified field dips below 0, then all modifications
+ * xfs_mod_sb structures passed in.  Either all of the specified deltas
- * will be backed out and EINVAL will be returned.
+ * will be applied or none of them will.  If any modified field dips below 0,
+ * then all modifications will be backed out and EINVAL will be returned.
+ *
+ * Note that this function may not be used for the superblock values that
+ * are tracked with the in-memory per-cpu counters - a direct call to
+ * xfs_icsb_modify_counters is required for these.
 */
 int
-xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
+xfs_mod_incore_sb_batch(
+        struct xfs_mount        *mp,
+        xfs_mod_sb_t            *msb,
+        uint                    nmsb,
+        int                     rsvd)
 {
-        int             status=0;
+        xfs_mod_sb_t            *msbp = &msb[0];
-        xfs_mod_sb_t    *msbp;
+        int                     error = 0;
        /*
-         * Loop through the array of mod structures and apply each
+         * Loop through the array of mod structures and apply each individually.
-         * individually.  If any fail, then back out all those
+         * If any fail, then back out all those which have already been applied.
-         * which have already been applied.  Do all of this within
+         * Do all of this within the scope of the m_sb_lock so that all of the
-         * the scope of the m_sb_lock so that all of the changes will
+         * changes will be atomic.
-         * be atomic.
         */
        spin_lock(&mp->m_sb_lock);
-        msbp = &msb[0];
        for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
-                /*
+                ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
-                 * Apply the delta at index n.  If it fails, break
+                       msbp->msb_field > XFS_SBS_FDBLOCKS);
-                 * from the loop so we'll fall into the undo loop
-                 * below.
-                 */
-                switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-                case XFS_SBS_ICOUNT:
-                case XFS_SBS_IFREE:
-                case XFS_SBS_FDBLOCKS:
-                        if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-                                spin_unlock(&mp->m_sb_lock);
-                                status = xfs_icsb_modify_counters(mp,
-                                                        msbp->msb_field,
-                                                        msbp->msb_delta, rsvd);
-                                spin_lock(&mp->m_sb_lock);
-                                break;
-                        }
-                        /* FALLTHROUGH */
-#endif
-                default:
-                        status = xfs_mod_incore_sb_unlocked(mp,
-                                                msbp->msb_field,
-                                                msbp->msb_delta, rsvd);
-                        break;
-                }
-                if (status != 0) {
+                error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-                        break;
+                                                   msbp->msb_delta, rsvd);
-                }
+                if (error)
+                        goto unwind;
        }
+        spin_unlock(&mp->m_sb_lock);
+        return 0;
-        /*
+unwind:
-         * If we didn't complete the loop above, then back out
+        while (--msbp >= msb) {
-         * any changes made to the superblock.  If you add code
+                error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-         * between the loop above and here, make sure that you
+                                                   -msbp->msb_delta, rsvd);
-         * preserve the value of status. Loop back until
+                ASSERT(error == 0);
-         * we step below the beginning of the array.  Make sure
-         * we don't touch anything back there.
-         */
-        if (status != 0) {
-                msbp--;
-                while (msbp >= msb) {
-                        switch (msbp->msb_field) {
-#ifdef HAVE_PERCPU_SB
-                        case XFS_SBS_ICOUNT:
-                        case XFS_SBS_IFREE:
-                        case XFS_SBS_FDBLOCKS:
-                                if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-                                        spin_unlock(&mp->m_sb_lock);
-                                        status = xfs_icsb_modify_counters(mp,
-                                                        msbp->msb_field,
-                                                        -(msbp->msb_delta),
-                                                        rsvd);
-                                        spin_lock(&mp->m_sb_lock);
-                                        break;
-                                }
-                                /* FALLTHROUGH */
-#endif
-                        default:
-                                status = xfs_mod_incore_sb_unlocked(mp,
-                                                        msbp->msb_field,
-                                                        -(msbp->msb_delta),
-                                                        rsvd);
-                                break;
-                        }
-                        ASSERT(status == 0);
-                        msbp--;
-                }
        }
        spin_unlock(&mp->m_sb_lock);
-        return status;
+        return error;
 }
 /*
@@ -1998,18 +1942,13 @@ xfs_getsb(
 */
 void
 xfs_freesb(
-        xfs_mount_t     *mp)
+        struct xfs_mount        *mp)
 {
-        xfs_buf_t       *bp;
+        struct xfs_buf          *bp = mp->m_sb_bp;
-        /*
+        xfs_buf_lock(bp);
-         * Use xfs_getsb() so that the buffer will be locked
-         * when we call xfs_buf_relse().
-         */
-        bp = xfs_getsb(mp, 0);
-        XFS_BUF_UNMANAGE(bp);
-        xfs_buf_relse(bp);
        mp->m_sb_bp = NULL;
+        xfs_buf_relse(bp);
 }
 /*
@@ -2496,7 +2435,7 @@ xfs_icsb_balance_counter(
        spin_unlock(&mp->m_sb_lock);
 }
-STATIC int
+int
 xfs_icsb_modify_counters(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..5861b4980740 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
 #include "xfs_sync.h"
-struct cred;
 struct log;
 struct xfs_mount_args;
 struct xfs_inode;
@@ -91,6 +90,8 @@ extern void	xfs_icsb_reinit_counters(struct xfs_mount *);
 extern void     xfs_icsb_destroy_counters(struct xfs_mount *);
 extern void     xfs_icsb_sync_counters(struct xfs_mount *, int);
 extern void     xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
+extern int      xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
+                                                int64_t, int);
 #else
 #define xfs_icsb_init_counters(mp)              (0)
@@ -98,6 +99,8 @@ extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 #define xfs_icsb_reinit_counters(mp)            do { } while (0)
 #define xfs_icsb_sync_counters(mp, flags)       do { } while (0)
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
+#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
+        xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
 typedef struct xfs_mount {
@@ -232,8 +235,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_DIRSYNC       (1ULL << 21)    /* synchronous directory ops */
 #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22)    /* don't report large preferred
                                                 * I/O size in stat() */
-#define XFS_MOUNT_NO_PERCPU_SB  (1ULL << 23)    /* don't use per-cpu superblock
-                                                   counters */
 #define XFS_MOUNT_FILESTREAMS   (1ULL << 24)    /* enable the filestreams
                                                   allocator */
 #define XFS_MOUNT_NOATTR2       (1ULL << 25)    /* disable use of attr2 format */
@@ -327,6 +328,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 * perag get/put wrappers for ref counting
 */
 struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+                                        int tag);
 void    xfs_perag_put(struct xfs_perag *pag);
 /*
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
 #define xfs_trans_apply_dquot_deltas(tp)
 #define xfs_trans_unreserve_and_mod_dquots(tp)
-#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)      (0)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
-#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)      (0)
+                struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+        return 0;
+}
+/* No-op stub: ignores its arguments and always reports success (returns 0). */
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+                struct xfs_mount *mp, struct xfs_dquot *udqp,
+                struct xfs_dquot *gdqp, long nblks, long ninos, uint flags)
+{
+        return 0;
+}
 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
 #define xfs_qm_vop_rename_dqattach(it)                                  (0)
 #define xfs_qm_vop_chown(tp, ip, old, new)                              (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
 #define xfs_qm_dqdetach(ip)
 #define xfs_qm_dqrele(d)
 #define xfs_qm_statvfs(ip, s)
-#define xfs_qm_sync(mp, fl)                                             (0)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+        return 0;
+}
 #define xfs_qm_newmount(mp, a, b)                                       (0)
 #define xfs_qm_mount_quotas(mp)
 #define xfs_qm_unmount(mp)
-#define xfs_qm_unmount_quotas(mp)                                       (0)
+#define xfs_qm_unmount_quotas(mp)
 #endif /* CONFIG_XFS_QUOTA */
 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_REFCACHE_H__
-#define __XFS_REFCACHE_H__
-#ifdef HAVE_REFCACHE
-/*
- * Maximum size (in inodes) for the NFS reference cache
- */
-#define XFS_REFCACHE_SIZE_MAX   512
-struct xfs_inode;
-struct xfs_mount;
-extern void xfs_refcache_insert(struct xfs_inode *);
-extern void xfs_refcache_purge_ip(struct xfs_inode *);
-extern void xfs_refcache_purge_mp(struct xfs_mount *);
-extern void xfs_refcache_purge_some(struct xfs_mount *);
-extern void xfs_refcache_resize(int);
-extern void xfs_refcache_destroy(void);
-extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
-#else
-#define xfs_refcache_insert(ip)         do { } while (0)
-#define xfs_refcache_purge_ip(ip)       do { } while (0)
-#define xfs_refcache_purge_mp(mp)       do { } while (0)
-#define xfs_refcache_purge_some(mp)     do { } while (0)
-#define xfs_refcache_resize(size)       do { } while (0)
-#define xfs_refcache_destroy()          do { } while (0)
-#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
-#endif
-#endif  /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..d2af0a8381a6 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
         * tree quota mechanism would be circumvented.
         */
        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
                error = XFS_ERROR(EXDEV);
                goto error_return;
        }
@@ -211,7 +211,9 @@ xfs_rename(
                        goto error_return;
                if (error)
                        goto abort_return;
-                xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+                xfs_trans_ichgtime(tp, target_dp,
+                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                if (new_parent && src_is_directory) {
                        error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
                                        &first_block, &free_list, spaceres);
                if (error)
                        goto abort_return;
-                xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+                xfs_trans_ichgtime(tp, target_dp,
+                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                /*
                 * Decrement the link count on the target since the target
@@ -292,7 +296,7 @@ xfs_rename(
         * inode isn't really being changed, but old unix file systems did
         * it and some incremental backup programs won't work without it.
         */
-        xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
        /*
         * Adjust the link count on src_dp.  This is necessary when
@@ -315,7 +319,7 @@ xfs_rename(
        if (error)
                goto abort_return;
-        xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
        if (new_parent)
                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..12a191385310 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_buf.h"
 /*
@@ -1883,13 +1884,13 @@ xfs_growfs_rt(
        /*
         * Read in the last block of the device, make sure it exists.
         */
-        error = xfs_read_buf(mp, mp->m_rtdev_targp,
+        bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
-                        XFS_FSB_TO_BB(mp, nrblocks - 1),
+                                XFS_FSB_TO_BB(mp, nrblocks - 1),
-                        XFS_FSB_TO_BB(mp, 1), 0, &bp);
+                                XFS_FSB_TO_B(mp, 1), 0);
-        if (error)
+        if (!bp)
-                return error;
+                return EIO;
-        ASSERT(bp);
        xfs_buf_relse(bp);
        /*
         * Calculate new parameters.  These are the final values to be reached.
         */
@@ -2215,7 +2216,6 @@ xfs_rtmount_init(
 {
        xfs_buf_t       *bp;    /* buffer for last block of subvolume */
        xfs_daddr_t     d;      /* address of last block of subvolume */
-        int             error;  /* error return value */
        xfs_sb_t        *sbp;   /* filesystem superblock copy in mount */
        sbp = &mp->m_sb;
@@ -2242,15 +2242,12 @@ xfs_rtmount_init(
                        (unsigned long long) mp->m_sb.sb_rblocks);
                return XFS_ERROR(EFBIG);
        }
-        error = xfs_read_buf(mp, mp->m_rtdev_targp,
+        bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
-                                d - XFS_FSB_TO_BB(mp, 1),
+                                        d - XFS_FSB_TO_BB(mp, 1),
-                                XFS_FSB_TO_BB(mp, 1), 0, &bp);
+                                        XFS_FSB_TO_B(mp, 1), 0);
-        if (error) {
+        if (!bp) {
-                cmn_err(CE_WARN,
+                cmn_err(CE_WARN, "XFS: realtime device size check failed");
-        "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
+                return EIO;
-                if (error == ENOSPC)
-                        return XFS_ERROR(EFBIG);
-                return error;
        }
        xfs_buf_relse(bp);
        return 0;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_RESERVED4BIT    0x00000004
 #define XFS_SB_VERSION2_ATTR2BIT        0x00000008      /* Inline attr rework */
 #define XFS_SB_VERSION2_PARENTBIT       0x00000010      /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT     0x00000080      /* 32 bit project id */
 #define XFS_SB_VERSION2_OKREALFBITS     \
        (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
-         XFS_SB_VERSION2_ATTR2BIT)
+         XFS_SB_VERSION2_ATTR2BIT       | \
+         XFS_SB_VERSION2_PROJID32BIT)
 #define XFS_SB_VERSION2_OKSASHFBITS     \
        (0)
 #define XFS_SB_VERSION2_OKREALBITS      \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
                sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
+/*
+ * Returns non-zero when xfs_sb_version_hasmorebits() holds for this
+ * superblock and XFS_SB_VERSION2_PROJID32BIT is set in sb_features2.
+ */
+static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+{
+        return xfs_sb_version_hasmorebits(sbp) &&
+                (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+}
 /*
 * end of superblock version macros
 */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..f6d956b7711e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -696,7 +696,7 @@ xfs_trans_reserve(
         * fail if the count would go below zero.
         */
        if (blocks > 0) {
-                error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+                error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
                                          -((int64_t)blocks), rsvd);
                if (error != 0) {
                        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +767,7 @@ undo_log:
 undo_blocks:
        if (blocks > 0) {
-                (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+                xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
                                         (int64_t)blocks, rsvd);
                tp->t_blk_res = 0;
        }
@@ -1009,7 +1009,7 @@ void
 xfs_trans_unreserve_and_mod_sb(
        xfs_trans_t     *tp)
 {
-        xfs_mod_sb_t    msb[14];        /* If you add cases, add entries */
+        xfs_mod_sb_t    msb[9]; /* If you add cases, add entries */
        xfs_mod_sb_t    *msbp;
        xfs_mount_t     *mp = tp->t_mountp;
        /* REFERENCED */
@@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb(
        int             rsvd;
        int64_t         blkdelta = 0;
        int64_t         rtxdelta = 0;
+        int64_t         idelta = 0;
+        int64_t         ifreedelta = 0;
        msbp = msb;
        rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
-        /* calculate free blocks delta */
+        /* calculate deltas */
        if (tp->t_blk_res > 0)
                blkdelta = tp->t_blk_res;
        if ((tp->t_fdblocks_delta != 0) &&
            (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
             (tp->t_flags & XFS_TRANS_SB_DIRTY)))
                blkdelta += tp->t_fdblocks_delta;
-        if (blkdelta != 0) {
-                msbp->msb_field = XFS_SBS_FDBLOCKS;
-                msbp->msb_delta = blkdelta;
-                msbp++;
-        }
-        /* calculate free realtime extents delta */
        if (tp->t_rtx_res > 0)
                rtxdelta = tp->t_rtx_res;
        if ((tp->t_frextents_delta != 0) &&
            (tp->t_flags & XFS_TRANS_SB_DIRTY))
                rtxdelta += tp->t_frextents_delta;
+        if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+             (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+                idelta = tp->t_icount_delta;
+                ifreedelta = tp->t_ifree_delta;
+        }
+        /* apply the per-cpu counters */
+        if (blkdelta) {
+                error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                                 blkdelta, rsvd);
+                if (error)
+                        goto out;
+        }
+        if (idelta) {
+                error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
+                                                 idelta, rsvd);
+                if (error)
+                        goto out_undo_fdblocks;
+        }
+        if (ifreedelta) {
+                error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
+                                                 ifreedelta, rsvd);
+                if (error)
+                        goto out_undo_icount;
+        }
+        /* apply remaining deltas */
        if (rtxdelta != 0) {
                msbp->msb_field = XFS_SBS_FREXTENTS;
                msbp->msb_delta = rtxdelta;
                msbp++;
        }
-        /* apply remaining deltas */
-        if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
-             (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
-                if (tp->t_icount_delta != 0) {
-                        msbp->msb_field = XFS_SBS_ICOUNT;
-                        msbp->msb_delta = tp->t_icount_delta;
-                        msbp++;
-                }
-                if (tp->t_ifree_delta != 0) {
-                        msbp->msb_field = XFS_SBS_IFREE;
-                        msbp->msb_delta = tp->t_ifree_delta;
-                        msbp++;
-                }
-        }
        if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
                if (tp->t_dblocks_delta != 0) {
                        msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb(
        if (msbp > msb) {
                error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
                        (uint)(msbp - msb), rsvd);
-                ASSERT(error == 0);
+                if (error)
+                        goto out_undo_ifreecount;
        }
+        return;
+        /*
+         * Error unwind: back out the per-cpu counter deltas that were already
+         * applied, in the reverse of the order they were applied.
+         */
+out_undo_ifreecount:
+        if (ifreedelta)
+                xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+out_undo_icount:
+        if (idelta)
+                xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+out_undo_fdblocks:
+        if (blkdelta)
+                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+out:
+        /* Must compare, not assign: "error = 0" would overwrite error. */
+        ASSERT(error == 0);
+        return;
 }
 /*
@@ -1389,15 +1411,12 @@ xfs_trans_item_committed(
 */
 STATIC void
 xfs_trans_committed(
-        struct xfs_trans        *tp,
+        void                    *arg,
        int                     abortflag)
 {
+        struct xfs_trans        *tp = arg;
        struct xfs_log_item_desc *lidp, *next;
-        /* Call the transaction's completion callback if there is one. */
-        if (tp->t_callback != NULL)
-                tp->t_callback(tp, tp->t_callarg);
        list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
                xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
                xfs_trans_free_item_desc(lidp);
@@ -1525,7 +1544,7 @@ xfs_trans_commit_iclog(
         * running in simulation mode (the log is explicitly turned
         * off).
         */
-        tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+        tp->t_logcb.cb_func = xfs_trans_committed;
        tp->t_logcb.cb_arg = tp;
        /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..246286b77a86 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
                                                 * transaction. */
        struct xfs_mount        *t_mountp;      /* ptr to fs mount struct */
        struct xfs_dquot_acct   *t_dqinfo;      /* acctg info for dquots */
-        xfs_trans_callback_t    t_callback;     /* transaction callback */
-        void                    *t_callarg;     /* callback arg */
        unsigned int            t_flags;        /* misc flags */
        int64_t                 t_icount_delta; /* superblock icount change */
        int64_t                 t_ifree_delta;  /* superblock ifree change */
@@ -473,6 +471,7 @@ void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void            xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 int             xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
                               xfs_ino_t , uint, uint, struct xfs_inode **);
+void            xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
 void            xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
 void            xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
 void            xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..c47918c302a5 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
                        ASSERT(!XFS_BUF_ISASYNC(bp));
                        XFS_BUF_READ(bp);
                        xfsbdstrat(tp->t_mountp, bp);
-                        error = xfs_iowait(bp);
+                        error = xfs_buf_iowait(bp);
                        if (error) {
                                xfs_ioerror_alert("xfs_trans_read_buf", mp,
                                                  bp, blkno);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..ccb34532768b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -118,6 +118,36 @@ xfs_trans_ijoin_ref(
 }
 /*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ *
+ * @flags selects which VFS inode timestamps are set to current_fs_time():
+ * XFS_ICHGTIME_MOD for i_mtime, XFS_ICHGTIME_CHG for i_ctime. A timestamp
+ * that already equals the new time is not rewritten.
+ */
+void
+xfs_trans_ichgtime(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *ip,
+        int                     flags)
+{
+        struct inode            *inode = VFS_I(ip);
+        timespec_t              tv;
+        ASSERT(tp);
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        ASSERT(ip->i_transp == tp);
+        tv = current_fs_time(inode->i_sb);
+        if ((flags & XFS_ICHGTIME_MOD) &&
+            !timespec_equal(&inode->i_mtime, &tv)) {
+                inode->i_mtime = tv;
+        }
+        if ((flags & XFS_ICHGTIME_CHG) &&
+            !timespec_equal(&inode->i_ctime, &tv)) {
+                inode->i_ctime = tv;
+        }
+}
+/*
 * This is called to mark the fields indicated in fieldmask as needing
 * to be logged when the transaction is committed.  The inode must
 * already be associated with the given transaction.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..26d1867d8156 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef __uint32_t      xfs_dablk_t;    /* dir/attr block number (in file) */
 typedef __uint32_t      xfs_dahash_t;   /* dir/attr hash value */
-typedef __uint16_t      xfs_prid_t;     /* prid_t truncated to 16bits in XFS */
 typedef __uint32_t      xlog_tid_t;     /* transaction ID type */
 /*
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
        mode_t          mode,
        xfs_nlink_t     nlink,
        xfs_dev_t       rdev,
-        cred_t          *credp,
        prid_t          prid,           /* project id */
        int             okalloc,        /* ok to allocate new space */
        xfs_inode_t     **ipp,          /* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
         * transaction commit so that no other process can steal
         * the inode(s) that we've just allocated.
         */
-        code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+        code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
                          &ialloc_context, &call_again, &ip);
        /*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
                 * other allocations in this allocation group,
                 * this call should always succeed.
                 */
-                code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+                code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
                                  okalloc, &ialloc_context, &call_again, &ip);
                /*
@@ -235,7 +234,7 @@ xfs_droplink(
 {
        int     error;
-        xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
        ASSERT (ip->i_d.di_nlink > 0);
        ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
 {
        if (ip->i_d.di_nlink >= XFS_MAXLINK)
                return XFS_ERROR(EMLINK);
-        xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
        ASSERT(ip->i_d.di_nlink > 0);
        ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
 #define __XFS_UTILS_H__
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-                                xfs_dev_t, cred_t *, prid_t, int,
+                                xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
-                                xfs_inode_t **, int *);
 extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
 extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
 extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..8e4a63c4151a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
                 */
                ASSERT(udqp == NULL);
                ASSERT(gdqp == NULL);
-                code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
+                code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
                                         qflags, &udqp, &gdqp);
                if (code)
                        return code;
@@ -184,8 +184,11 @@ xfs_setattr(
                    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                        lock_flags &= ~XFS_ILOCK_EXCL;
-                        if (mask & ATTR_CTIME)
+                        if (mask & ATTR_CTIME) {
-                                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+                                inode->i_mtime = inode->i_ctime =
+                                                current_fs_time(inode->i_sb);
+                                xfs_mark_inode_dirty_sync(ip);
+                        }
                        code = 0;
                        goto error_return;
                }
@@ -1253,8 +1256,7 @@ xfs_create(
        struct xfs_name         *name,
        mode_t                  mode,
        xfs_dev_t               rdev,
-        xfs_inode_t             **ipp,
+        xfs_inode_t             **ipp)
-        cred_t                  *credp)
 {
        int                     is_dir = S_ISDIR(mode);
        struct xfs_mount        *mp = dp->i_mount;
@@ -1266,7 +1268,7 @@ xfs_create(
        boolean_t               unlock_dp_on_error = B_FALSE;
        uint                    cancel_flags;
        int                     committed;
-        xfs_prid_t              prid;
+        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
        uint                    resblks;
@@ -1279,9 +1281,9 @@ xfs_create(
                return XFS_ERROR(EIO);
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-                prid = dp->i_d.di_projid;
+                prid = xfs_get_projid(dp);
        else
-                prid = dfltprid;
+                prid = XFS_PROJID_DEFAULT;
        /*
         * Make sure that we have allocated dquot(s) on disk.
@@ -1360,7 +1362,7 @@ xfs_create(
         * entry pointing to them, but a directory also the "." entry
         * pointing to itself.
         */
-        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
                               prid, resblks > 0, &ip, &committed);
        if (error) {
                if (error == ENOSPC)
@@ -1391,7 +1393,7 @@ xfs_create(
                ASSERT(error != ENOSPC);
                goto out_trans_abort;
        }
-        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        if (is_dir) {
@@ -1742,7 +1744,7 @@ xfs_remove(
                ASSERT(error != ENOENT);
                goto out_bmap_cancel;
        }
-        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        if (is_dir) {
                /*
@@ -1880,7 +1882,7 @@ xfs_link(
         * the tree quota mechanism could be circumvented.
         */
        if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
+                     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
                error = XFS_ERROR(EXDEV);
                goto error_return;
        }
@@ -1895,7 +1897,7 @@ xfs_link(
                                        &first_block, &free_list, resblks);
        if (error)
                goto abort_return;
-        xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
        error = xfs_bumplink(tp, sip);
@@ -1933,8 +1935,7 @@ xfs_symlink(
        struct xfs_name         *link_name,
        const char              *target_path,
        mode_t                  mode,
-        xfs_inode_t             **ipp,
+        xfs_inode_t             **ipp)
-        cred_t                  *credp)
 {
        xfs_mount_t             *mp = dp->i_mount;
        xfs_trans_t             *tp;
@@ -1955,7 +1956,7 @@ xfs_symlink(
        int                     byte_cnt;
        int                     n;
        xfs_buf_t               *bp;
-        xfs_prid_t              prid;
+        prid_t                  prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
@@ -1978,9 +1979,9 @@ xfs_symlink(
        udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-                prid = dp->i_d.di_projid;
+                prid = xfs_get_projid(dp);
        else
-                prid = (xfs_prid_t)dfltprid;
+                prid = XFS_PROJID_DEFAULT;
        /*
         * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2047,8 @@ xfs_symlink(
        /*
         * Allocate an inode for the symlink.
         */
-        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
+        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
-                               1, 0, credp, prid, resblks > 0, &ip, NULL);
+                               prid, resblks > 0, &ip, NULL);
        if (error) {
                if (error == ENOSPC)
                        goto error_return;
@@ -2129,7 +2130,7 @@ xfs_symlink(
                                        &first_block, &free_list, resblks);
        if (error)
                goto error1;
-        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        /*
@@ -2272,7 +2273,7 @@ xfs_alloc_file_space(
        count = len;
        imapp = &imaps[0];
        nimaps = 1;
-        bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+        bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
        startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
        allocatesize_fsb = XFS_B_TO_FSB(mp, count);
@@ -2431,9 +2432,9 @@ xfs_zero_remaining_bytes(
        if (endoff > ip->i_size)
                endoff = ip->i_size;
-        bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
+        bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-                                XFS_IS_REALTIME_INODE(ip) ?
+                                        mp->m_rtdev_targp : mp->m_ddev_targp,
-                                mp->m_rtdev_targp : mp->m_ddev_targp);
+                                mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
        if (!bp)
                return XFS_ERROR(ENOMEM);
@@ -2459,7 +2460,7 @@ xfs_zero_remaining_bytes(
                XFS_BUF_READ(bp);
                XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
                xfsbdstrat(mp, bp);
-                error = xfs_iowait(bp);
+                error = xfs_buf_iowait(bp);
                if (error) {
                        xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
                                          mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2473,7 @@ xfs_zero_remaining_bytes(
                XFS_BUF_UNREAD(bp);
                XFS_BUF_WRITE(bp);
                xfsbdstrat(mp, bp);
-                error = xfs_iowait(bp);
+                error = xfs_buf_iowait(bp);
                if (error) {
                        xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
                                          mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2712,7 @@ xfs_change_file_space(
        xfs_off_t       llen;
        xfs_trans_t     *tp;
        struct iattr    iattr;
+        int             prealloc_type;
        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);
@@ -2753,12 +2755,17 @@ xfs_change_file_space(
         * size to be changed.
         */
        setprealloc = clrprealloc = 0;
+        prealloc_type = XFS_BMAPI_PREALLOC;
        switch (cmd) {
+        case XFS_IOC_ZERO_RANGE:
+                prealloc_type |= XFS_BMAPI_CONVERT;
+                xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+                /* FALLTHRU */
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-                                                                1, attr_flags);
+                                                prealloc_type, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
@@ -2827,7 +2834,7 @@ xfs_change_file_space(
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;
-                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }
        if (setprealloc)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..f6702927eee4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
 #define _XFS_VNODEOPS_H 1
 struct attrlist_cursor_kern;
-struct cred;
 struct file;
 struct iattr;
 struct inode;
@@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
                struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-                xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
+                xfs_dev_t rdev, struct xfs_inode **ipp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
                struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int xfs_readdir(struct xfs_inode        *dp, void *dirent, size_t bufsize,
                       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-                const char *target_path, mode_t mode, struct xfs_inode **ipp,
+                const char *target_path, mode_t mode, struct xfs_inode **ipp);
-                cred_t *credp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
                xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
author	Thomas Gleixner <tglx@linutronix.de>	2010-12-09 12:17:25 -0500
committer	Thomas Gleixner <tglx@linutronix.de>	2010-12-09 12:17:25 -0500
commit	d834a9dcecae834cd6b2bc5e50e1907738d9cf6a (patch)
tree	0589d753465d3fe359ba451ba6cb7798df03aaa2 /fs
parent	a38c5380ef9f088be9f49b6e4c5d80af8b1b5cd4 (diff)
parent	f658bcfb2607bf0808966a69cf74135ce98e5c2d (diff)