author	Sage Weil <sage@inktank.com>	2013-08-15 14:11:45 -0400
committer	Sage Weil <sage@inktank.com>	2013-08-15 14:11:45 -0400
commit	ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree	e74ee766a4764769ef1d3d45d266b4dea64101d3 /fs
parent	fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent	f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 13
-rw-r--r--  fs/9p/Makefile | 4
-rw-r--r--  fs/9p/vfs_addr.c | 5
-rw-r--r--  fs/9p/vfs_dir.c | 72
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/9p/xattr.c | 4
-rw-r--r--  fs/9p/xattr.h | 2
-rw-r--r--  fs/9p/xattr_security.c | 80
-rw-r--r--  fs/9p/xattr_trusted.c | 80
-rw-r--r--  fs/adfs/dir.c | 48
-rw-r--r--  fs/affs/dir.c | 69
-rw-r--r--  fs/affs/namei.c | 26
-rw-r--r--  fs/afs/dir.c | 99
-rw-r--r--  fs/afs/file.c | 10
-rw-r--r--  fs/afs/flock.c | 7
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/autofs4/expire.c | 8
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/bad_inode.c | 4
-rw-r--r--  fs/befs/linuxvfs.c | 40
-rw-r--r--  fs/bfs/dir.c | 35
-rw-r--r--  fs/binfmt_aout.c | 2
-rw-r--r--  fs/binfmt_elf.c | 2
-rw-r--r--  fs/block_dev.c | 33
-rw-r--r--  fs/btrfs/backref.c | 120
-rw-r--r--  fs/btrfs/backref.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 121
-rw-r--r--  fs/btrfs/ctree.h | 105
-rw-r--r--  fs/btrfs/delayed-inode.c | 23
-rw-r--r--  fs/btrfs/delayed-inode.h | 3
-rw-r--r--  fs/btrfs/dev-replace.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 486
-rw-r--r--  fs/btrfs/disk-io.h | 32
-rw-r--r--  fs/btrfs/export.c | 5
-rw-r--r--  fs/btrfs/extent-tree.c | 340
-rw-r--r--  fs/btrfs/extent_io.c | 52
-rw-r--r--  fs/btrfs/extent_io.h | 1
-rw-r--r--  fs/btrfs/file-item.c | 144
-rw-r--r--  fs/btrfs/file.c | 227
-rw-r--r--  fs/btrfs/free-space-cache.c | 103
-rw-r--r--  fs/btrfs/free-space-cache.h | 2
-rw-r--r--  fs/btrfs/inode.c | 601
-rw-r--r--  fs/btrfs/ioctl.c | 82
-rw-r--r--  fs/btrfs/lzo.c | 4
-rw-r--r--  fs/btrfs/ordered-data.c | 128
-rw-r--r--  fs/btrfs/ordered-data.h | 27
-rw-r--r--  fs/btrfs/qgroup.c | 283
-rw-r--r--  fs/btrfs/relocation.c | 102
-rw-r--r--  fs/btrfs/root-tree.c | 201
-rw-r--r--  fs/btrfs/scrub.c | 92
-rw-r--r--  fs/btrfs/send.c | 235
-rw-r--r--  fs/btrfs/super.c | 25
-rw-r--r--  fs/btrfs/transaction.c | 330
-rw-r--r--  fs/btrfs/transaction.h | 52
-rw-r--r--  fs/btrfs/tree-log.c | 46
-rw-r--r--  fs/btrfs/ulist.c | 15
-rw-r--r--  fs/btrfs/version.h | 4
-rw-r--r--  fs/btrfs/volumes.c | 351
-rw-r--r--  fs/btrfs/volumes.h | 7
-rw-r--r--  fs/buffer.c | 55
-rw-r--r--  fs/cachefiles/interface.c | 13
-rw-r--r--  fs/cachefiles/namei.c | 10
-rw-r--r--  fs/cachefiles/rdwr.c | 30
-rw-r--r--  fs/cachefiles/xattr.c | 6
-rw-r--r--  fs/ceph/addr.c | 15
-rw-r--r--  fs/ceph/dir.c | 99
-rw-r--r--  fs/ceph/file.c | 11
-rw-r--r--  fs/ceph/inode.c | 4
-rw-r--r--  fs/ceph/locks.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 10
-rw-r--r--  fs/cifs/Kconfig | 1
-rw-r--r--  fs/cifs/cifs_debug.c | 52
-rw-r--r--  fs/cifs/cifs_unicode.h | 8
-rw-r--r--  fs/cifs/cifsencrypt.c | 189
-rw-r--r--  fs/cifs/cifsfs.c | 26
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 69
-rw-r--r--  fs/cifs/cifspdu.h | 17
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 425
-rw-r--r--  fs/cifs/connect.c | 172
-rw-r--r--  fs/cifs/dir.c | 23
-rw-r--r--  fs/cifs/file.c | 75
-rw-r--r--  fs/cifs/inode.c | 5
-rw-r--r--  fs/cifs/link.c | 84
-rw-r--r--  fs/cifs/misc.c | 3
-rw-r--r--  fs/cifs/readdir.c | 215
-rw-r--r--  fs/cifs/sess.c | 101
-rw-r--r--  fs/cifs/smb1ops.c | 53
-rw-r--r--  fs/cifs/smb2file.c | 24
-rw-r--r--  fs/cifs/smb2glob.h | 2
-rw-r--r--  fs/cifs/smb2inode.c | 57
-rw-r--r--  fs/cifs/smb2misc.c | 4
-rw-r--r--  fs/cifs/smb2ops.c | 102
-rw-r--r--  fs/cifs/smb2pdu.c | 502
-rw-r--r--  fs/cifs/smb2pdu.h | 114
-rw-r--r--  fs/cifs/smb2proto.h | 20
-rw-r--r--  fs/cifs/smb2transport.c | 246
-rw-r--r--  fs/cifs/smbfsctl.h | 27
-rw-r--r--  fs/cifs/transport.c | 6
-rw-r--r--  fs/coda/dir.c | 76
-rw-r--r--  fs/compat.c | 43
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/configfs/dir.c | 137
-rw-r--r--  fs/configfs/file.c | 2
-rw-r--r--  fs/coredump.c | 121
-rw-r--r--  fs/cramfs/inode.c | 21
-rw-r--r--  fs/dcache.c | 77
-rw-r--r--  fs/debugfs/file.c | 43
-rw-r--r--  fs/debugfs/inode.c | 69
-rw-r--r--  fs/dlm/config.c | 5
-rw-r--r--  fs/dlm/lock.c | 8
-rw-r--r--  fs/dlm/lockspace.c | 9
-rw-r--r--  fs/dlm/lowcomms.c | 177
-rw-r--r--  fs/dlm/user.c | 1
-rw-r--r--  fs/ecryptfs/crypto.c | 342
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 51
-rw-r--r--  fs/ecryptfs/inode.c | 4
-rw-r--r--  fs/ecryptfs/main.c | 7
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/efivarfs/inode.c | 14
-rw-r--r--  fs/efivarfs/super.c | 9
-rw-r--r--  fs/efs/dir.c | 75
-rw-r--r--  fs/eventpoll.c | 16
-rw-r--r--  fs/exec.c | 17
-rw-r--r--  fs/exofs/dir.c | 38
-rw-r--r--  fs/exofs/inode.c | 6
-rw-r--r--  fs/exportfs/expfs.c | 14
-rw-r--r--  fs/ext2/dir.c | 27
-rw-r--r--  fs/ext2/namei.c | 24
-rw-r--r--  fs/ext3/dir.c | 157
-rw-r--r--  fs/ext3/fsync.c | 8
-rw-r--r--  fs/ext3/inode.c | 10
-rw-r--r--  fs/ext3/namei.c | 54
-rw-r--r--  fs/ext3/super.c | 13
-rw-r--r--  fs/ext4/balloc.c | 18
-rw-r--r--  fs/ext4/dir.c | 158
-rw-r--r--  fs/ext4/ext4.h | 189
-rw-r--r--  fs/ext4/ext4_jbd2.c | 58
-rw-r--r--  fs/ext4/ext4_jbd2.h | 29
-rw-r--r--  fs/ext4/extents.c | 214
-rw-r--r--  fs/ext4/extents_status.c | 144
-rw-r--r--  fs/ext4/extents_status.h | 5
-rw-r--r--  fs/ext4/file.c | 38
-rw-r--r--  fs/ext4/fsync.c | 52
-rw-r--r--  fs/ext4/ialloc.c | 13
-rw-r--r--  fs/ext4/indirect.c | 40
-rw-r--r--  fs/ext4/inline.c | 168
-rw-r--r--  fs/ext4/inode.c | 1791
-rw-r--r--  fs/ext4/ioctl.c | 6
-rw-r--r--  fs/ext4/mballoc.c | 32
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 54
-rw-r--r--  fs/ext4/page-io.c | 336
-rw-r--r--  fs/ext4/resize.c | 24
-rw-r--r--  fs/ext4/super.c | 189
-rw-r--r--  fs/f2fs/Kconfig | 12
-rw-r--r--  fs/f2fs/acl.c | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 99
-rw-r--r--  fs/f2fs/data.c | 71
-rw-r--r--  fs/f2fs/debug.c | 4
-rw-r--r--  fs/f2fs/dir.c | 153
-rw-r--r--  fs/f2fs/f2fs.h | 66
-rw-r--r--  fs/f2fs/file.c | 58
-rw-r--r--  fs/f2fs/gc.c | 42
-rw-r--r--  fs/f2fs/inode.c | 13
-rw-r--r--  fs/f2fs/namei.c | 17
-rw-r--r--  fs/f2fs/node.c | 37
-rw-r--r--  fs/f2fs/node.h | 68
-rw-r--r--  fs/f2fs/recovery.c | 150
-rw-r--r--  fs/f2fs/segment.c | 101
-rw-r--r--  fs/f2fs/super.c | 253
-rw-r--r--  fs/f2fs/xattr.c | 68
-rw-r--r--  fs/f2fs/xattr.h | 24
-rw-r--r--  fs/fat/dir.c | 104
-rw-r--r--  fs/fat/fat.h | 1
-rw-r--r--  fs/fat/file.c | 8
-rw-r--r--  fs/fat/inode.c | 12
-rw-r--r--  fs/fat/misc.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 6
-rw-r--r--  fs/fat/namei_vfat.c | 12
-rw-r--r--  fs/fcntl.c | 4
-rw-r--r--  fs/file_table.c | 33
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 55
-rw-r--r--  fs/fs-writeback.c | 19
-rw-r--r--  fs/fscache/cache.c | 34
-rw-r--r--  fs/fscache/cookie.c | 93
-rw-r--r--  fs/fscache/fsdef.c | 1
-rw-r--r--  fs/fscache/internal.h | 11
-rw-r--r--  fs/fscache/main.c | 11
-rw-r--r--  fs/fscache/netfs.c | 1
-rw-r--r--  fs/fscache/object-list.c | 103
-rw-r--r--  fs/fscache/object.c | 1106
-rw-r--r--  fs/fscache/operation.c | 37
-rw-r--r--  fs/fscache/page.c | 65
-rw-r--r--  fs/fuse/dir.c | 88
-rw-r--r--  fs/fuse/file.c | 3
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/Kconfig | 5
-rw-r--r--  fs/gfs2/aops.c | 17
-rw-r--r--  fs/gfs2/bmap.c | 4
-rw-r--r--  fs/gfs2/dentry.c | 3
-rw-r--r--  fs/gfs2/dir.c | 82
-rw-r--r--  fs/gfs2/dir.h | 7
-rw-r--r--  fs/gfs2/export.c | 10
-rw-r--r--  fs/gfs2/file.c | 94
-rw-r--r--  fs/gfs2/glops.c | 8
-rw-r--r--  fs/gfs2/inode.c | 150
-rw-r--r--  fs/gfs2/inode.h | 1
-rw-r--r--  fs/gfs2/log.c | 78
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 22
-rw-r--r--  fs/gfs2/lops.h | 1
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 7
-rw-r--r--  fs/gfs2/rgrp.c | 14
-rw-r--r--  fs/gfs2/trans.c | 9
-rw-r--r--  fs/hfs/dir.c | 49
-rw-r--r--  fs/hfs/hfs_fs.h | 7
-rw-r--r--  fs/hfs/string.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 50
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 7
-rw-r--r--  fs/hfsplus/unicode.c | 7
-rw-r--r--  fs/hostfs/hostfs_kern.c | 13
-rw-r--r--  fs/hpfs/buffer.c | 33
-rw-r--r--  fs/hpfs/dentry.c | 7
-rw-r--r--  fs/hpfs/dir.c | 56
-rw-r--r--  fs/hpfs/file.c | 40
-rw-r--r--  fs/hpfs/hpfs_fn.h | 7
-rw-r--r--  fs/hpfs/map.c | 22
-rw-r--r--  fs/hpfs/super.c | 17
-rw-r--r--  fs/hppfs/hppfs.c | 44
-rw-r--r--  fs/hugetlbfs/inode.c | 10
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/internal.h | 6
-rw-r--r--  fs/isofs/dir.c | 42
-rw-r--r--  fs/isofs/inode.c | 48
-rw-r--r--  fs/isofs/namei.c | 3
-rw-r--r--  fs/jbd/transaction.c | 19
-rw-r--r--  fs/jbd2/Kconfig | 6
-rw-r--r--  fs/jbd2/checkpoint.c | 22
-rw-r--r--  fs/jbd2/commit.c | 184
-rw-r--r--  fs/jbd2/journal.c | 166
-rw-r--r--  fs/jbd2/recovery.c | 11
-rw-r--r--  fs/jbd2/revoke.c | 49
-rw-r--r--  fs/jbd2/transaction.c | 526
-rw-r--r--  fs/jffs2/dir.c | 52
-rw-r--r--  fs/jfs/jfs_dmap.c | 70
-rw-r--r--  fs/jfs/jfs_dtree.c | 100
-rw-r--r--  fs/jfs/jfs_dtree.h | 2
-rw-r--r--  fs/jfs/jfs_extent.c | 2
-rw-r--r--  fs/jfs/jfs_imap.c | 69
-rw-r--r--  fs/jfs/jfs_metapage.c | 10
-rw-r--r--  fs/jfs/jfs_superblock.h | 1
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/jfs_xtree.c | 62
-rw-r--r--  fs/jfs/namei.c | 11
-rw-r--r--  fs/jfs/resize.c | 2
-rw-r--r--  fs/jfs/super.c | 22
-rw-r--r--  fs/jfs/xattr.c | 8
-rw-r--r--  fs/libfs.c | 83
-rw-r--r--  fs/lockd/clntlock.c | 13
-rw-r--r--  fs/lockd/clntproc.c | 5
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/lockd/svclock.c | 18
-rw-r--r--  fs/lockd/svcsubs.c | 12
-rw-r--r--  fs/locks.c | 328
-rw-r--r--  fs/logfs/dir.c | 49
-rw-r--r--  fs/logfs/file.c | 3
-rw-r--r--  fs/logfs/segment.c | 3
-rw-r--r--  fs/minix/dir.c | 42
-rw-r--r--  fs/minix/namei.c | 13
-rw-r--r--  fs/namei.c | 123
-rw-r--r--  fs/ncpfs/dir.c | 123
-rw-r--r--  fs/ncpfs/inode.c | 16
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/Kconfig | 14
-rw-r--r--  fs/nfs/Makefile | 6
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 3
-rw-r--r--  fs/nfs/callback.c | 6
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 3
-rw-r--r--  fs/nfs/callback_xdr.c | 52
-rw-r--r--  fs/nfs/client.c | 4
-rw-r--r--  fs/nfs/delegation.c | 10
-rw-r--r--  fs/nfs/dir.c | 143
-rw-r--r--  fs/nfs/dns_resolve.c | 32
-rw-r--r--  fs/nfs/file.c | 38
-rw-r--r--  fs/nfs/getroot.c | 2
-rw-r--r--  fs/nfs/idmap.c | 56
-rw-r--r--  fs/nfs/inode.c | 149
-rw-r--r--  fs/nfs/internal.h | 3
-rw-r--r--  fs/nfs/mount_clnt.c | 14
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4client.c | 15
-rw-r--r--  fs/nfs/nfs4file.c | 1
-rw-r--r--  fs/nfs/nfs4filelayout.c | 3
-rw-r--r--  fs/nfs/nfs4filelayout.h | 3
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 699
-rw-r--r--  fs/nfs/nfs4session.c | 40
-rw-r--r--  fs/nfs/nfs4session.h | 7
-rw-r--r--  fs/nfs/nfs4state.c | 46
-rw-r--r--  fs/nfs/nfs4super.c | 14
-rw-r--r--  fs/nfs/nfs4xdr.c | 189
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 4
-rw-r--r--  fs/nfs/pnfs.c | 42
-rw-r--r--  fs/nfs/pnfs.h | 6
-rw-r--r--  fs/nfs/proc.c | 13
-rw-r--r--  fs/nfs/super.c | 203
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nfs/write.c | 31
-rw-r--r--  fs/nfsd/Kconfig | 16
-rw-r--r--  fs/nfsd/nfs4proc.c | 48
-rw-r--r--  fs/nfsd/nfs4recover.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 235
-rw-r--r--  fs/nfsd/nfs4xdr.c | 174
-rw-r--r--  fs/nfsd/nfsd.h | 27
-rw-r--r--  fs/nfsd/nfssvc.c | 13
-rw-r--r--  fs/nfsd/state.h | 1
-rw-r--r--  fs/nfsd/vfs.c | 42
-rw-r--r--  fs/nfsd/vfs.h | 7
-rw-r--r--  fs/nfsd/xdr4.h | 4
-rw-r--r--  fs/nilfs2/alloc.c | 63
-rw-r--r--  fs/nilfs2/alloc.h | 2
-rw-r--r--  fs/nilfs2/dir.c | 48
-rw-r--r--  fs/nilfs2/ifile.c | 22
-rw-r--r--  fs/nilfs2/ifile.h | 2
-rw-r--r--  fs/nilfs2/inode.c | 8
-rw-r--r--  fs/nilfs2/segment.c | 4
-rw-r--r--  fs/nilfs2/super.c | 33
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.h | 4
-rw-r--r--  fs/notify/dnotify/dnotify.c | 25
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 92
-rw-r--r--  fs/notify/inotify/inotify_user.c | 13
-rw-r--r--  fs/notify/mark.c | 50
-rw-r--r--  fs/ntfs/aops.c | 2
-rw-r--r--  fs/ntfs/dir.c | 84
-rw-r--r--  fs/ocfs2/alloc.c | 8
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 19
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 29
-rw-r--r--  fs/ocfs2/dir.c | 153
-rw-r--r--  fs/ocfs2/dir.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r--  fs/ocfs2/file.c | 22
-rw-r--r--  fs/ocfs2/journal.c | 14
-rw-r--r--  fs/ocfs2/journal.h | 1
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 70
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/refcounttree.c | 58
-rw-r--r--  fs/ocfs2/refcounttree.h | 6
-rw-r--r--  fs/ocfs2/suballoc.c | 37
-rw-r--r--  fs/ocfs2/super.c | 6
-rw-r--r--  fs/ocfs2/xattr.c | 18
-rw-r--r--  fs/omfs/dir.c | 94
-rw-r--r--  fs/open.c | 67
-rw-r--r--  fs/openpromfs/inode.c | 95
-rw-r--r--  fs/proc/base.c | 462
-rw-r--r--  fs/proc/fd.c | 114
-rw-r--r--  fs/proc/generic.c | 100
-rw-r--r--  fs/proc/internal.h | 10
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/namespaces.c | 87
-rw-r--r--  fs/proc/proc_net.c | 9
-rw-r--r--  fs/proc/proc_sysctl.c | 78
-rw-r--r--  fs/proc/root.c | 19
-rw-r--r--  fs/proc/task_mmu.c | 168
-rw-r--r--  fs/proc/uptime.c | 3
-rw-r--r--  fs/proc/vmcore.c | 694
-rw-r--r--  fs/pstore/ftrace.c | 2
-rw-r--r--  fs/pstore/inode.c | 11
-rw-r--r--  fs/pstore/platform.c | 21
-rw-r--r--  fs/pstore/ram.c | 5
-rw-r--r--  fs/pstore/ram_core.c | 54
-rw-r--r--  fs/qnx4/dir.c | 66
-rw-r--r--  fs/qnx6/dir.c | 31
-rw-r--r--  fs/quota/dquot.c | 6
-rw-r--r--  fs/read_write.c | 65
-rw-r--r--  fs/readdir.c | 56
-rw-r--r--  fs/reiserfs/dir.c | 36
-rw-r--r--  fs/reiserfs/inode.c | 12
-rw-r--r--  fs/reiserfs/procfs.c | 99
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/reiserfs/xattr.c | 33
-rw-r--r--  fs/romfs/super.c | 21
-rw-r--r--  fs/select.c | 66
-rw-r--r--  fs/seq_file.c | 54
-rw-r--r--  fs/splice.c | 38
-rw-r--r--  fs/squashfs/dir.c | 40
-rw-r--r--  fs/super.c | 25
-rw-r--r--  fs/sysfs/dir.c | 68
-rw-r--r--  fs/sysfs/file.c | 10
-rw-r--r--  fs/sysfs/group.c | 70
-rw-r--r--  fs/sysfs/inode.c | 2
-rw-r--r--  fs/sysv/dir.c | 37
-rw-r--r--  fs/sysv/namei.c | 3
-rw-r--r--  fs/timerfd.c | 131
-rw-r--r--  fs/ubifs/dir.c | 57
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/udf/dir.c | 63
-rw-r--r--  fs/udf/namei.c | 24
-rw-r--r--  fs/ufs/dir.c | 28
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/xfs_alloc.c | 24
-rw-r--r--  fs/xfs/xfs_aops.c | 14
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 199
-rw-r--r--  fs/xfs/xfs_bmap.h | 1
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 87
-rw-r--r--  fs/xfs/xfs_buf_item.h | 4
-rw-r--r--  fs/xfs/xfs_dfrag.c | 8
-rw-r--r--  fs/xfs/xfs_dinode.h | 6
-rw-r--r--  fs/xfs/xfs_dir2.c | 13
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 37
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 21
-rw-r--r--  fs/xfs/xfs_dir2_priv.h | 11
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 31
-rw-r--r--  fs/xfs/xfs_dquot.c | 31
-rw-r--r--  fs/xfs/xfs_dquot.h | 11
-rw-r--r--  fs/xfs/xfs_file.c | 18
-rw-r--r--  fs/xfs/xfs_fsops.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 74
-rw-r--r--  fs/xfs/xfs_ialloc.h | 8
-rw-r--r--  fs/xfs/xfs_icache.c | 4
-rw-r--r--  fs/xfs/xfs_icache.h | 1
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 195
-rw-r--r--  fs/xfs/xfs_icreate_item.h | 52
-rw-r--r--  fs/xfs/xfs_inode.c | 105
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_ioctl.c | 16
-rw-r--r--  fs/xfs/xfs_iomap.c | 13
-rw-r--r--  fs/xfs/xfs_iops.c | 27
-rw-r--r--  fs/xfs/xfs_itable.c | 33
-rw-r--r--  fs/xfs/xfs_log.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 5
-rw-r--r--  fs/xfs/xfs_log_cil.c | 75
-rw-r--r--  fs/xfs/xfs_log_recover.c | 127
-rw-r--r--  fs/xfs/xfs_mount.c | 92
-rw-r--r--  fs/xfs/xfs_mount.h | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 394
-rw-r--r--  fs/xfs/xfs_qm.h | 97
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 10
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 75
-rw-r--r--  fs/xfs/xfs_quota.h | 104
-rw-r--r--  fs/xfs/xfs_quotaops.c | 6
-rw-r--r--  fs/xfs/xfs_sb.h | 6
-rw-r--r--  fs/xfs/xfs_super.c | 39
-rw-r--r--  fs/xfs/xfs_symlink.c | 61
-rw-r--r--  fs/xfs/xfs_symlink.h | 2
-rw-r--r--  fs/xfs/xfs_sysctl.c | 26
-rw-r--r--  fs/xfs/xfs_trace.h | 20
-rw-r--r--  fs/xfs/xfs_trans.c | 118
-rw-r--r--  fs/xfs/xfs_trans.h | 16
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 34
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 122
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 11
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 28
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
470 files changed, 16631 insertions, 11991 deletions
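
Most of the churn above comes from the tree-wide "[readdir]" conversion merged for 3.11: the ->readdir() file operation is replaced by ->iterate(), which takes a struct dir_context instead of an opaque cookie plus a filldir_t callback. The filesystem advances ctx->pos itself and hands each entry to dir_emit(), which returns false once the caller's buffer is full. Below is a minimal sketch of the new shape, for a hypothetical filesystem whose directory entries already sit in an in-memory table; the myfs_* names are illustrative, not code from this merge.

#include <linux/fs.h>
#include <linux/string.h>

struct myfs_entry {
	const char *name;	/* NULL name terminates the table */
	u64 ino;
	unsigned char type;	/* DT_REG, DT_DIR, ... */
};

static int myfs_iterate(struct file *file, struct dir_context *ctx)
{
	struct myfs_entry *tbl = file_inode(file)->i_private;

	/* "." and ".." first; dir_emit_dots() advances ctx->pos past 2 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	for (; tbl[ctx->pos - 2].name; ctx->pos++) {
		struct myfs_entry *e = &tbl[ctx->pos - 2];

		/* dir_emit() returns false when the destination buffer is
		 * full; just stop, since ctx->pos already names the next
		 * entry to resume from */
		if (!dir_emit(ctx, e->name, strlen(e->name), e->ino, e->type))
			return 0;
	}
	return 0;
}

const struct file_operations myfs_dir_operations = {
	.read = generic_read_dir,
	.llseek = generic_file_llseek,
	.iterate = myfs_iterate,
};

The per-filesystem conversions below (9p, adfs, affs, afs, and many more in the diffstat) all follow this pattern; resuming at ctx->pos replaces the old hand-maintained filp->f_pos bookkeeping.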
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
 	  If you don't know what Access Control Lists are, say N

 endif
+
+
+config 9P_FS_SECURITY
+	bool "9P Security Labels"
+	depends on 9P_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the 9P filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	v9fs.o \
 	fid.o \
 	xattr.o \
-	xattr_user.o
+	xattr_user.o \
+	xattr_trusted.o

 9p-$(CONFIG_9P_FSCACHE) += cache.o
 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */

-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34adc3c6..4d0c2e0be7e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 }

 /**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */

-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int over;
+	bool over;
 	struct p9_wstat st;
 	int err = 0;
 	struct p9_fid *fid;
@@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;

-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;

 	buflen = fid->clnt->msize - P9_IOHDRSZ;

-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;

 	while (1) {
 		if (rdir->tail == rdir->head) {
-			err = v9fs_file_readn(filp, rdir->buf, NULL,
-					      buflen, filp->f_pos);
+			err = v9fs_file_readn(file, rdir->buf, NULL,
+					      buflen, ctx->pos);
 			if (err <= 0)
 				return err;

@@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 			reclen = st.size+2;

-			over = filldir(dirent, st.name, strlen(st.name),
-				filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
 			p9stat_free(&st);
-
 			if (over)
 				return 0;

 			rdir->head += reclen;
-			filp->f_pos += reclen;
+			ctx->pos += reclen;
 		}
 	}
 }

 /**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
-				 filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 {
-	int over;
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
-	u64 oldoffset = 0;

-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;

 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;

-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;

 	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+						ctx->pos);
 			if (err <= 0)
 				return err;

@@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 				return -EIO;
 			}

-			/* d_off in dirent structure tracks the offset into
-			 * the next dirent in the dir. However, filldir()
-			 * expects offset into the current dirent. Hence
-			 * while calling filldir send the offset from the
-			 * previous dirent structure.
-			 */
-			over = filldir(dirent, curdirent.d_name,
-				       strlen(curdirent.d_name),
-				       oldoffset, v9fs_qid2ino(&curdirent.qid),
-				       curdirent.d_type);
-			oldoffset = curdirent.d_off;
-
-			if (over)
+			if (!dir_emit(ctx, curdirent.d_name,
+				      strlen(curdirent.d_name),
+				      v9fs_qid2ino(&curdirent.qid),
+				      curdirent.d_type))
 				return 0;

-			filp->f_pos = curdirent.d_off;
+			ctx->pos = curdirent.d_off;
 			rdir->head += err;
 		}
 	}
@@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.iterate = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
@@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir_dotl,
+	.iterate = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..25b018efb8ab 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,13 +1054,11 @@ static int
 v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
-	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_wstat *st;

 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)

 const struct xattr_handler *v9fs_xattr_handlers[] = {
 	&v9fs_xattr_user_handler,
+	&v9fs_xattr_trusted_handler,
 #ifdef CONFIG_9P_FS_POSIX_ACL
 	&v9fs_xattr_acl_access_handler,
 	&v9fs_xattr_acl_default_handler,
 #endif
+#ifdef CONFIG_9P_FS_SECURITY
+	&v9fs_xattr_security_handler,
+#endif
 	NULL
 };
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@

 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
 extern const struct xattr_handler v9fs_xattr_acl_access_handler;
 extern const struct xattr_handler v9fs_xattr_acl_default_handler;

diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+				   void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+				   const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get = v9fs_xattr_security_get,
+	.set = v9fs_xattr_security_set,
+};
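
The VFS dispatches xattr calls by prefix: generic code matches "security." against the handler's .prefix and passes only the remainder as name, which is why the get/set helpers above rebuild the full "security.<name>" string before sending it down to the server. From user space the handler is reached through the ordinary xattr syscalls. A hedged example follows; it assumes a v9fs mount at /mnt/9p whose server stores the labels, and both the path and the label value are made up for illustration.

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/9p/file";	/* assumed mount point */
	const char *val = "system_u:object_r:etc_t:s0";
	char buf[256];
	ssize_t n;

	/* store a label; routed through v9fs_xattr_security_set() */
	if (setxattr(path, "security.selinux", val, strlen(val), 0) != 0)
		perror("setxattr");

	/* read it back; routed through v9fs_xattr_security_get() */
	n = getxattr(path, "security.selinux", buf, sizeof(buf));
	if (n >= 0)
		printf("security.selinux = %.*s\n", (int)n, buf);
	else
		perror("getxattr");
	return 0;
}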
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+				  void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.get = v9fs_xattr_trusted_get,
+	.set = v9fs_xattr_trusted_set,
+};
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874ce8336..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
 static DEFINE_RWLOCK(adfs_dir_lock);

 static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;

-	if (filp->f_pos >> 32)
-		goto out;
+	if (ctx->pos >> 32)
+		return 0;

 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
 	if (ret)
-		goto out;
+		return ret;

-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
 			goto free_out;
-		filp->f_pos += 1;
-
-	case 1:
-		if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
 			goto free_out;
-		filp->f_pos += 1;
-
-	default:
-		break;
+		ctx->pos = 2;
 	}

 	read_lock(&adfs_dir_lock);

-	ret = ops->setpos(&dir, filp->f_pos - 2);
+	ret = ops->setpos(&dir, ctx->pos - 2);
 	if (ret)
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
-		if (filldir(dirent, obj.name, obj.name_len,
-			    filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
-			goto unlock_out;
-		filp->f_pos += 1;
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			      obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
 	}

 unlock_out:
@@ -65,8 +61,6 @@ unlock_out:

 free_out:
 	ops->free(&dir);
-
-out:
 	return ret;
 }

@@ -192,13 +186,12 @@ out:
 const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = adfs_readdir,
+	.iterate = adfs_readdir,
 	.fsync = generic_file_fsync,
 };

 static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
-	  struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
-	const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
 	unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d608ee..f1eba8c3644e 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@

 #include "affs.h"

-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);

 const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = affs_readdir,
+	.iterate = affs_readdir,
 	.fsync = affs_file_fsync,
 };

@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
 };

 static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *dir_bh;
-	struct buffer_head *fh_bh;
+	struct buffer_head *dir_bh = NULL;
+	struct buffer_head *fh_bh = NULL;
 	unsigned char *name;
 	int namelen;
 	u32 i;
 	int hash_pos;
 	int chain_pos;
-	u32 f_pos;
 	u32 ino;
-	int stored;
-	int res;

-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);

-	stored = 0;
-	res = -EIO;
-	dir_bh = NULL;
-	fh_bh = NULL;
-	f_pos = filp->f_pos;
-
-	if (f_pos == 0) {
-		filp->private_data = (void *)0;
-		if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
 			return 0;
-		filp->f_pos = f_pos = 1;
-		stored++;
-	}
-	if (f_pos == 1) {
-		if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
-			return stored;
-		filp->f_pos = f_pos = 2;
-		stored++;
 	}

 	affs_lock_dir(inode);
-	chain_pos = (f_pos - 2) & 0xffff;
-	hash_pos = (f_pos - 2) >> 16;
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos = (ctx->pos - 2) >> 16;
 	if (chain_pos == 0xffff) {
 		affs_warning(sb, "readdir", "More than 65535 entries in chain");
 		chain_pos = 0;
 		hash_pos++;
-		filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
 	 */
-	ino = (u32)(long)filp->private_data;
-	if (ino && filp->f_version == inode->i_version) {
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
 		pr_debug("AFFS: readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			goto readdir_out;
+			return -EIO;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
 		if (!ino)
 			continue;
-		f_pos = (hash_pos << 16) + 2;
+		ctx->pos = (hash_pos << 16) + 2;
 inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
 				affs_error(sb, "readdir","Cannot read block %d", ino);
-				goto readdir_done;
+				break;
 			}

 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
 			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
-				 namelen, name, ino, hash_pos, f_pos);
-			if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 				goto readdir_done;
-			stored++;
-			f_pos++;
+			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
readdir_done:
-	filp->f_pos = f_pos;
-	filp->f_version = inode->i_version;
-	filp->private_data = (void *)(long)ino;
-	res = stored;
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;

readdir_out:
 	affs_brelse(dir_bh);
 	affs_brelse(fh_bh);
 	affs_unlock_dir(inode);
-	pr_debug("AFFS: readdir()=%d\n", stored);
-	return res;
+	return 0;
 }
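
One affs-specific wrinkle survives the conversion: affs_readdir() caches the block number where iteration stopped in file->private_data and trusts it only while file->f_version still matches inode->i_version, so an unchanged directory can resume without rescanning the hash chains. A sketch of that pattern in the new API, with hypothetical myfs_* cursor helpers standing in for the AFFS block walk:

#include <linux/fs.h>

/* hypothetical helpers, declared only so the sketch is self-contained */
static u32 myfs_cursor_for_pos(struct inode *inode, loff_t pos);
static u32 myfs_emit_from(struct inode *inode, struct dir_context *ctx,
			  u32 cursor);

static int myfs_iterate(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	u32 cursor = (u32)(unsigned long)file->private_data;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* trust the saved cursor only while the directory is unchanged;
	 * otherwise recompute it from ctx->pos */
	if (!cursor || file->f_version != inode->i_version)
		cursor = myfs_cursor_for_pos(inode, ctx->pos);

	/* emit entries from the cursor onward, advancing ctx->pos */
	cursor = myfs_emit_from(inode, ctx, cursor);

	file->f_version = inode->i_version;
	file->private_data = (void *)(unsigned long)cursor;
	return 0;
}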
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
 typedef int (*toupper_t)(int);

 static int affs_toupper(int ch);
-static int affs_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);

 const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 }

 static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
 }

 static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
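
adfs and affs also pick up the companion VFS change in this merge: the d_hash() and d_compare() dentry operations lose their inode arguments in 3.11, since none of the in-tree implementations actually needed them. The post-merge prototypes, shown with an illustrative case-folding implementation (the example_* names are hypothetical; the helpers are the stock name-hash macros from <linux/dcache.h>):

#include <linux/dcache.h>
#include <linux/ctype.h>

static int example_hash(const struct dentry *parent, struct qstr *qstr)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	/* fold case while hashing so "FOO" and "foo" collide */
	for (i = 0; i < qstr->len; i++)
		hash = partial_name_hash(tolower(qstr->name[i]), hash);
	qstr->hash = end_name_hash(hash);
	return 0;
}

static int example_compare(const struct dentry *parent,
			   const struct dentry *dentry,
			   unsigned int len, const char *str,
			   const struct qstr *name)
{
	unsigned int i;

	if (len != name->len)
		return 1;	/* non-zero means "no match" */
	for (i = 0; i < len; i++)
		if (tolower(str[i]) != tolower(name->name[i]))
			return 1;
	return 0;
}

const struct dentry_operations example_dentry_operations = {
	.d_hash = example_hash,
	.d_compare = example_compare,
};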
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed04444..34494fbead0a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 const struct file_operations afs_dir_file_operations = {
 	.open = afs_dir_open,
 	.release = afs_release,
-	.readdir = afs_readdir,
+	.iterate = afs_readdir,
 	.lock = afs_lock,
 	.llseek = generic_file_llseek,
 };
@@ -119,9 +119,9 @@ struct afs_dir_page {
 };

 struct afs_lookup_cookie {
+	struct dir_context ctx;
 	struct afs_fid fid;
-	const char *name;
-	size_t nlen;
+	struct qstr name;
 	int found;
 };

@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
 				 union afs_dir_block *block,
-				 unsigned blkoff,
-				 void *cookie,
-				 filldir_t filldir)
+				 unsigned blkoff)
 {
 	union afs_dirent *dire;
 	unsigned offset, next, curr;
 	size_t nlen;
-	int tmp, ret;
+	int tmp;

-	_enter("%u,%x,%p,,",*fpos,blkoff,block);
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);

-	curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);

 	/* walk through the block, an entry at a time */
 	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
-				*fpos = blkoff +
+				ctx->pos = blkoff +
 					next * sizeof(union afs_dirent);
 			continue;
 		}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			continue;

 		/* found the next entry */
-		ret = filldir(cookie,
-			      dire->u.name,
-			      nlen,
-			      blkoff + offset * sizeof(union afs_dirent),
+		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
-			      filldir == afs_lookup_filldir ?
-			      ntohl(dire->u.unique) : DT_UNKNOWN);
-		if (ret < 0) {
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
 			_leave(" = 0 [full]");
 			return 0;
 		}

-		*fpos = blkoff + next * sizeof(union afs_dirent);
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
 	}

 	_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
 /*
  * iterate through the data blob that lists the contents of an AFS directory
  */
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct key *key)
 {
 	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	unsigned blkoff, limit;
 	int ret;

-	_enter("{%lu},%u,,", dir->i_ino, *fpos);
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);

 	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	}

 	/* round the file position up to the next entry boundary */
-	*fpos += sizeof(union afs_dirent) - 1;
-	*fpos &= ~(sizeof(union afs_dirent) - 1);
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);

 	/* walk through the blocks in sequence */
 	ret = 0;
-	while (*fpos < dir->i_size) {
-		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);

 		/* fetch the appropriate page from the directory */
 		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_dir_block)];
-			ret = afs_dir_iterate_block(fpos, dblock, blkoff,
-						    cookie, filldir);
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
 			if (ret != 1) {
 				afs_dir_put_page(page);
 				goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,

 			blkoff += sizeof(union afs_dir_block);

-		} while (*fpos < dir->i_size && blkoff < limit);
+		} while (ctx->pos < dir->i_size && blkoff < limit);

 		afs_dir_put_page(page);
 		ret = 0;
@@ -387,23 +380,10 @@ out:
 /*
  * read an AFS directory
  */
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned fpos;
-	int ret;
-
-	_enter("{%Ld,{%lu}}",
-	       file->f_pos, file_inode(file)->i_ino);
-
-	ASSERT(file->private_data != NULL);
-
-	fpos = file->f_pos;
-	ret = afs_dir_iterate(file_inode(file), &fpos,
-			      cookie, filldir, file->private_data);
-	file->f_pos = fpos;
-
-	_leave(" = %d", ret);
-	return ret;
+	return afs_dir_iterate(file_inode(file),
+			       ctx, file->private_data);
 }

 /*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 {
 	struct afs_lookup_cookie *cookie = _cookie;

-	_enter("{%s,%Zu},%s,%u,,%llu,%u",
-	       cookie->name, cookie->nlen, name, nlen,
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
 	       (unsigned long long) ino, dtype);

 	/* insanity checks first */
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);

-	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
 		return 0;
 	}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_lookup_cookie cookie;
-	struct afs_super_info *as;
-	unsigned fpos;
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
 	int ret;

 	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);

-	as = dir->i_sb->s_fs_info;
-
 	/* search the directory */
-	cookie.name = dentry->d_name.name;
-	cookie.nlen = dentry->d_name.len;
-	cookie.fid.vid = as->volume->vid;
-	cookie.found = 0;
-
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
-			      key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
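
The afs lookup path shows the idiom the new API encourages for in-kernel directory scans: embed a struct dir_context as the first member of a private cookie, point .actor at a matching callback, and feed the cookie's ctx through the same iterator that readdir uses. Because the context is the first member, the actor can cast its argument back to the enclosing cookie. A hedged sketch follows; the myfs_* names are hypothetical and the actor signature is the 3.11-era filldir_t.

#include <linux/fs.h>
#include <linux/string.h>

/* hypothetical: walks a directory, calling ctx->actor for each entry */
static int myfs_iterate_dir(struct inode *dir, struct dir_context *ctx);

struct myfs_lookup_cookie {
	struct dir_context ctx;	/* must stay first for the cast below */
	struct qstr name;
	u64 ino;
	bool found;
};

static int myfs_lookup_actor(void *_cookie, const char *name, int nlen,
			     loff_t pos, u64 ino, unsigned dtype)
{
	struct myfs_lookup_cookie *cookie = _cookie;

	if (cookie->name.len != nlen ||
	    memcmp(cookie->name.name, name, nlen) != 0)
		return 0;	/* not it; keep iterating */

	cookie->ino = ino;
	cookie->found = true;
	return -1;		/* non-zero stops the iteration */
}

static int myfs_do_lookup(struct inode *dir, struct dentry *dentry, u64 *ino)
{
	struct myfs_lookup_cookie cookie = {
		.ctx.actor = myfs_lookup_actor,
		.name = dentry->d_name,
	};
	int ret = myfs_iterate_dir(dir, &cookie.ctx);

	if (ret < 0)
		return ret;
	if (!cookie.found)
		return -ENOENT;
	*ino = cookie.ino;
	return 0;
}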
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"

 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);

@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);

-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);

 	BUG_ON(!PageLocked(page));

 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
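
The afs hunks above track another 3.11 interface change: ->invalidatepage() now receives an offset and a length, so the VFS can invalidate a sub-range of a page (groundwork for punch-hole and the ext4 rework elsewhere in this merge). Filesystems that only tear down per-page private state on a full invalidation now have to test both values. A minimal sketch of the new signature, with hypothetical myfs_* names and the 3.11-era PAGE_CACHE_SIZE constant:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void myfs_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	BUG_ON(!PageLocked(page));

	/* only drop private state when the whole page is going away;
	 * a partial invalidation leaves it attached */
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
	}
}

static const struct address_space_operations myfs_aops = {
	.invalidatepage = myfs_invalidatepage,
};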
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
 	struct key *key = file->private_data;
 	int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	_leave(" = %d", ret);
 	return ret;
 
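
This hunk reflects the retirement of the global lock_flocks()/unlock_flocks() pair: per-file lock state is now serialized by the owning inode's i_lock. The critical-section shape, sketched for a hypothetical caller:

	static void myfs_walk_flock_state(struct file *file)
	{
		struct inode *inode = file_inode(file);

		spin_lock(&inode->i_lock);
		/* inspect or update this inode's file_lock state */
		spin_unlock(&inode->i_lock);
	}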
diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf74d0c..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,8 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -623,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
-	 * ctx->ctx_lock to prevent other code from messing with the tail
+	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
 
 	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
 	/* Already gone or negative dentry (under construction) - try next */
-	if (q->d_count == 0 || !simple_positive(q)) {
+	if (!d_count(q) || !simple_positive(q)) {
 		spin_unlock(&q->d_lock);
 		next = q->d_u.d_child.next;
 		goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 		else
 			ino_count++;
 
-		if (p->d_count > ino_count) {
+		if (d_count(p) > ino_count) {
 			top_ino->last_used = jiffies;
 			dput(p);
 			return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		if (!exp_leaves) {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		} else {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
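
These hunks are mechanical substitutions for the new d_count() accessor, which replaces direct reads of dentry->d_count ahead of changes to how the refcount is stored. The busyness test they all encode, as a standalone sketch (helper name is illustrative):

	/* "known_refs" is however many references the caller itself holds */
	static bool dentry_busier_than(struct dentry *dentry, unsigned int known_refs)
	{
		return d_count(dentry) > known_refs;
	}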
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 085da86e07c2..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
 #ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 };
 
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (active->d_count == 0)
+		if (!d_count(active))
 			goto next;
 
 		qstr = &active->d_name;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff9..7c93953030fb 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return -EIO;
 }
 
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
 {
 	return -EIO;
 }
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
 	.write		= bad_file_write,
 	.aio_read	= bad_file_aio_read,
 	.aio_write	= bad_file_aio_write,
-	.readdir	= bad_file_readdir,
+	.iterate	= bad_file_readdir,
 	.poll		= bad_file_poll,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f95dddced968..e9c75e20db32 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
 /* The units the vfs expects inode->i_blocks to be in */
 #define VFS_BLOCK_SIZE 512
 
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= befs_readdir,
+	.iterate	= befs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 }
 
 static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
 	befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	size_t keysize;
 	unsigned char d_type;
 	char keybuf[BEFS_NAME_LEN + 1];
-	char *nlsname;
-	int nlsnamelen;
-	const char *dirname = filp->f_path.dentry->d_name.name;
+	const char *dirname = file->f_path.dentry->d_name.name;
 
 	befs_debug(sb, "---> befs_readdir() "
-		   "name %s, inode %ld, filp->f_pos %Ld",
-		   dirname, inode->i_ino, filp->f_pos);
+		   "name %s, inode %ld, ctx->pos %Ld",
+		   dirname, inode->i_ino, ctx->pos);
 
-	result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
 				 keybuf, &keysize, &value);
 
 	if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* Convert to NLS */
 	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
 		result =
 		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
 		if (result < 0) {
 			befs_debug(sb, "<--- befs_readdir() ERROR");
 			return result;
 		}
-		result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+			      (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
 		kfree(nlsname);
-
 	} else {
-		result = filldir(dirent, keybuf, keysize, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, keybuf, keysize,
+			      (ino_t) value, d_type))
+			return 0;
 	}
-	if (!result)
-		filp->f_pos++;
+	ctx->pos++;
+	goto more;
 
-	befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+	befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
 
 	return 0;
 }
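
The befs conversion shows the full iterate() contract in one place: read the entry at ctx->pos, hand it to dir_emit(), stop and return 0 when the destination buffer is full (dir_emit() returns false), and only advance ctx->pos once an entry has been emitted. Condensed into a sketch for an imaginary table-backed directory (the MYFS_* names and myfs_* helpers are stand-ins, not real API):

	static int myfs_readdir(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		while (ctx->pos < myfs_nr_entries(inode)) {
			struct myfs_dirent *de = myfs_entry(inode, ctx->pos);

			if (de->ino &&
			    !dir_emit(ctx, de->name,
				      strnlen(de->name, MYFS_NAME_LEN),
				      de->ino, DT_UNKNOWN))
				return 0;	/* buffer full: resume at ctx->pos later */
			ctx->pos++;
		}
		return 0;
	}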
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6bb5ca..a399e6d9dc74 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
 			const unsigned char *name, int namelen,
 			struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(f);
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
-	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	mutex_lock(&info->bfs_lock);
-
-	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
-					(unsigned long)f->f_pos,
+					(unsigned long)ctx->pos,
 					dir->i_sb->s_id, dir->i_ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EBADF;
+		return -EINVAL;
 	}
 
-	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE - 1);
-		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
-			f->f_pos += BFS_BSIZE - offset;
+			ctx->pos += BFS_BSIZE - offset;
 			continue;
 		}
 		do {
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos,
+				if (!dir_emit(ctx, de->name, size,
 						le16_to_cpu(de->ino),
-						DT_UNKNOWN) < 0) {
+						DT_UNKNOWN)) {
 					brelse(bh);
-					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
 			offset += BFS_DIRENT_SIZE;
-			f->f_pos += BFS_DIRENT_SIZE;
-		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
 		brelse(bh);
 	}
-
-	mutex_unlock(&info->bfs_lock);
 	return 0;
 }
 
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= bfs_readdir,
+	.iterate	= bfs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2091db8cdd78..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
 	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+	bool wakeup_bdi = false;
 
 	if (unlikely(dst == old))		/* deadlock avoidance */
 		return;
 	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
-	if (inode->i_state & I_DIRTY)
+	if (inode->i_state & I_DIRTY) {
+		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+			wakeup_bdi = true;
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	}
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&old->wb.list_lock);
 	spin_unlock(&dst->wb.list_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(dst);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -325,31 +332,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *bd_inode = file->f_mapping->host;
-	loff_t size;
 	loff_t retval;
 
 	mutex_lock(&bd_inode->i_mutex);
-	size = i_size_read(bd_inode);
-
-	retval = -EINVAL;
-	switch (whence) {
-	case SEEK_END:
-		offset += size;
-		break;
-	case SEEK_CUR:
-		offset += file->f_pos;
-	case SEEK_SET:
-		break;
-	default:
-		goto out;
-	}
-	if (offset >= 0 && offset <= size) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-		}
-		retval = offset;
-	}
-out:
+	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
 	mutex_unlock(&bd_inode->i_mutex);
 	return retval;
 }
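
The rewrite above delegates the whence handling to the fixed_size_llseek() helper, which implements SEEK_SET/SEEK_CUR/SEEK_END against a caller-supplied size with the same bounds semantics the open-coded switch had. Any fixed-size file can take the same shape under its own serialization; a sketch:

	static loff_t myfs_llseek(struct file *file, loff_t offset, int whence)
	{
		struct inode *inode = file->f_mapping->host;
		loff_t ret;

		mutex_lock(&inode->i_mutex);	/* matches the locking above */
		ret = fixed_size_llseek(file, offset, whence, i_size_read(inode));
		mutex_unlock(&inode->i_mutex);
		return ret;
	}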
@@ -1583,6 +1569,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..8bc5e8ccb091 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 			      u64 extent_item_pos,
 			      struct extent_inode_elem **eie)
 {
-	u64 data_offset;
-	u64 data_len;
+	u64 offset = 0;
 	struct extent_inode_elem *e;
 
-	data_offset = btrfs_file_extent_offset(eb, fi);
-	data_len = btrfs_file_extent_num_bytes(eb, fi);
+	if (!btrfs_file_extent_compression(eb, fi) &&
+	    !btrfs_file_extent_encryption(eb, fi) &&
+	    !btrfs_file_extent_other_encoding(eb, fi)) {
+		u64 data_offset;
+		u64 data_len;
 
-	if (extent_item_pos < data_offset ||
-	    extent_item_pos >= data_offset + data_len)
-		return 1;
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			return 1;
+		offset = extent_item_pos - data_offset;
+	}
 
 	e = kmalloc(sizeof(*e), GFP_NOFS);
 	if (!e)
@@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 
 	e->next = *eie;
 	e->inum = key->objectid;
-	e->offset = key->offset + (extent_item_pos - data_offset);
+	e->offset = key->offset + offset;
 	*eie = e;
 
 	return 0;
@@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	struct extent_buffer *eb;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
-	struct extent_inode_elem *eie = NULL;
+	struct extent_inode_elem *eie = NULL, *old = NULL;
 	u64 disk_byte;
 
 	if (level != 0) {
@@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 
 		if (disk_byte == wanted_disk_byte) {
 			eie = NULL;
+			old = NULL;
 			if (extent_item_pos) {
 				ret = check_extent_in_eb(&key, eb, fi,
 						*extent_item_pos,
@@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 				if (ret < 0)
 					break;
 			}
-			if (!ret) {
-				ret = ulist_add(parents, eb->start,
-						(uintptr_t)eie, GFP_NOFS);
-				if (ret < 0)
-					break;
-				if (!extent_item_pos) {
-					ret = btrfs_next_old_leaf(root, path,
-							time_seq);
-					continue;
-				}
+			if (ret > 0)
+				goto next;
+			ret = ulist_add_merge(parents, eb->start,
+					      (uintptr_t)eie,
+					      (u64 *)&old, GFP_NOFS);
+			if (ret < 0)
+				break;
+			if (!ret && extent_item_pos) {
+				while (old->next)
+					old = old->next;
+				old->next = eie;
 			}
 		}
+next:
 		ret = btrfs_next_old_item(root, path, time_seq);
 	}
 
@@ -255,13 +265,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
  * to a logical address
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-				  int search_commit_root,
-				  u64 time_seq,
-				  struct __prelim_ref *ref,
-				  struct ulist *parents,
-				  const u64 *extent_item_pos)
+				  struct btrfs_path *path, u64 time_seq,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
 {
-	struct btrfs_path *path;
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
 	struct extent_buffer *eb;
@@ -269,11 +277,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	int root_level;
 	int level = ref->level;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
-
 	root_key.objectid = ref->root_id;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
@@ -314,7 +317,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 					time_seq, ref->wanted_disk_byte,
 					extent_item_pos);
 out:
-	btrfs_free_path(path);
+	path->lowest_level = 0;
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -322,7 +326,7 @@ out:
  * resolve all indirect backrefs from the list
 */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root, u64 time_seq,
+				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
 				   const u64 *extent_item_pos)
 {
@@ -349,9 +353,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
-		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     time_seq, ref, parents,
-					     extent_item_pos);
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos);
 		if (err == -ENOMEM)
 			goto out;
 		if (err)
@@ -604,6 +607,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	unsigned long ptr;
 	unsigned long end;
 	struct btrfs_extent_item *ei;
@@ -621,17 +625,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
 	} else {
 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
@@ -795,7 +803,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
 	struct list_head prefs_delayed;
 	struct list_head prefs;
 	struct __prelim_ref *ref;
@@ -804,13 +811,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&prefs_delayed);
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
+	if (!trans)
+		path->search_commit_root = 1;
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +836,7 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
-	if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+	if (trans) {
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -869,7 +880,8 @@ again:
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
-		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs);
 			if (ret)
@@ -890,8 +902,8 @@ again:
 
 	__merge_refs(&prefs, 1);
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
-				      &prefs, extent_item_pos);
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
				      extent_item_pos);
 	if (ret)
 		goto out;
 
@@ -1283,12 +1295,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 {
 	int ret;
 	u64 flags;
+	u64 size = 0;
 	u32 item_size;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
@@ -1301,9 +1317,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		return ret;
 
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->leafsize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical) {
+	    found_key->objectid + size <= logical) {
 		pr_debug("logical %llu is not within any extent\n",
 			 (unsigned long long)logical);
 		return -ENOENT;
@@ -1459,7 +1481,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *refs = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
@@ -1471,9 +1493,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	pr_debug("resolving all inodes for extent %llu\n",
 		 extent_item_objectid);
 
-	if (search_commit_root) {
-		trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
-	} else {
+	if (!search_commit_root) {
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
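
A recurring motif in the backref changes is support for the SKINNY_METADATA incompat format, where tree blocks are described by BTRFS_METADATA_ITEM_KEY items (with the block level in the key offset) instead of full extent items. Lookups therefore pick the key type from the filesystem's feature bits, roughly as below (the helper name is illustrative; the logic is lifted straight from the hunks above):

	static void init_extent_search_key(struct btrfs_fs_info *fs_info,
					   u64 bytenr, struct btrfs_key *key)
	{
		key->objectid = bytenr;
		key->offset = (u64)-1;
		/* skinny filesystems index tree blocks by metadata items */
		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
			key->type = BTRFS_METADATA_ITEM_KEY;
		else
			key->type = BTRFS_EXTENT_ITEM_KEY;
	}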
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
 #include "ulist.h"
 #include "extent_io.h"
 
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
 	struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02fae7f7e42c..ed504607d8ec 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		tree_mod_log_free_eb(root->fs_info, buf);
+		if (last_ref)
+			tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 * time_seq).
 */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-		      struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
 {
 	u32 n;
 	struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
 	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		if (tm->index != first_tm->index)
 			break;
 	}
+	tree_mod_log_read_unlock(fs_info);
 	btrfs_set_header_nritems(eb, n);
 }
 
@@ -1268,13 +1271,12 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 		BUG_ON(!eb_rewin);
 	}
 
-	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_unlock(eb);
 	free_extent_buffer(eb);
 
 	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_lock(eb_rewin);
-	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
@@ -1350,7 +1352,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_set_header_generation(eb, old_generation);
 	}
 	if (tm)
-		__tree_mod_log_rewind(eb, time_seq, tm);
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2180,8 @@ static void reada_for_search(struct btrfs_root *root,
 	}
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-				      struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
 {
 	int slot;
 	int nritems;
@@ -2192,12 +2190,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	u64 gen;
 	u64 block1 = 0;
 	u64 block2 = 0;
-	int ret = 0;
 	int blocksize;
 
 	parent = path->nodes[level + 1];
 	if (!parent)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
@@ -2224,28 +2221,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
-	if (block1 || block2) {
-		ret = -EAGAIN;
-
-		/* release the whole path */
-		btrfs_release_path(path);
 
-		/* read the blocks */
-		if (block1)
-			readahead_tree_block(root, block1, blocksize, 0);
-		if (block2)
-			readahead_tree_block(root, block2, blocksize, 0);
-
-		if (block1) {
-			eb = read_tree_block(root, block1, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		if (block2) {
-			eb = read_tree_block(root, block2, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-	}
-	return ret;
+	if (block1)
+		readahead_tree_block(root, block1, blocksize, 0);
+	if (block2)
+		readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -2359,35 +2339,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				/*
-				 * we found an up to date block without
-				 * sleeping, return
-				 * right away
-				 */
-				*eb_ret = tmp;
-				return 0;
-			}
-			/* the pages were up to date, but we failed
-			 * the generation number check.  Do a full
-			 * read for the generation number that is correct.
-			 * We must do this without dropping locks so
-			 * we can trust our generation number
-			 */
-			free_extent_buffer(tmp);
-			btrfs_set_path_blocking(p);
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
 
-			/* now we're allowed to do a blocking uptodate check */
-			tmp = read_tree_block(root, blocknr, blocksize, gen);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-				*eb_ret = tmp;
-				return 0;
-			}
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return -EIO;
+		/* the pages were up to date, but we failed
+		 * the generation number check.  Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
 		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
 	}
 
 	/*
@@ -2448,11 +2421,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = split_node(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2472,11 +2442,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = balance_level(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -3143,7 +3110,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 */
 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
-			   struct btrfs_path *path, int level, int log_removal)
+			   struct btrfs_path *path, int level)
 {
 	u64 lower_gen;
 	struct extent_buffer *lower;
@@ -3194,7 +3161,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 
 	old = root->node;
-	tree_mod_log_set_root_pointer(root, c, log_removal);
+	tree_mod_log_set_root_pointer(root, c, 0);
 	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
@@ -3278,14 +3245,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		/*
 		 * trying to split the root, lets make a new one
 		 *
-		 * tree mod log: We pass 0 as log_removal parameter to
+		 * tree mod log: We don't log_removal old root in
 		 * insert_new_root, because that root buffer will be kept as a
 		 * normal node. We are going to log removal of half of the
 		 * elements below with tree_mod_log_eb_copy. We're holding a
 		 * tree lock on the buffer, which is why we cannot race with
 		 * other tree_mod_log users.
 		 */
-		ret = insert_new_root(trans, root, path, level + 1, 0);
+		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
 	} else {
@@ -3986,7 +3953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size) {
+	if (data_size && path->nodes[1]) {
 		wret = push_leaf_right(trans, root, path, data_size,
 				       data_size, 0, 0);
 		if (wret < 0)
@@ -4005,7 +3972,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	}
 
 	if (!path->nodes[1]) {
-		ret = insert_new_root(trans, root, path, 1, 1);
+		ret = insert_new_root(trans, root, path, 1);
 		if (ret)
 			return ret;
 	}
@@ -4430,7 +4397,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 }
 
 /*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
 */
 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 		       u32 data_size)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5		(1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
 
 enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
 				   account */
 
 	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter. However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed everytime we call
+	 * btrfs_free_extent so it is a realtime count of what will be freed
+	 * once the transaction is committed. It will be zero'ed everytime the
+	 * transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	/*
 	 * we bump reservation progress every time we decrement
 	 * bytes_reserved. This way people waiting for reservations
 	 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
 	atomic_t open_ioctl_trans;
 
 	/*
-	 * this is used by the balancing code to wait for all the pending
-	 * ordered extents
+	 * this is used to protect the following list -- ordered_roots.
 	 */
-	spinlock_t ordered_extent_lock;
+	spinlock_t ordered_root_lock;
 
 	/*
-	 * all of the data=ordered extents pending writeback
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
 	 * these can span multiple transactions and basically include
 	 * every dirty data page that isn't from nodatacow
 	 */
-	struct list_head ordered_extents;
+	struct list_head ordered_roots;
 
-	spinlock_t delalloc_lock;
-	/*
-	 * all of the inodes that have delalloc bytes.  It is possible for
-	 * this list to be empty even when there is still dirty data=ordered
-	 * extents waiting to finish IO.
-	 */
-	struct list_head delalloc_inodes;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	int enospc_unlink;
-	int trans_no_join;
 
 	u64 total_pinned;
 
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
 	struct rb_root qgroup_tree;
 	spinlock_t qgroup_lock;
 
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting, it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
 	/* protect user change for quota operations */
 	struct mutex qgroup_ioctl_lock;
 
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
 	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
 
 	/* filesystem state */
 	unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
 	int force_cow;
 
 	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
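
The new fields move ordered-extent and delalloc tracking from one global pair of lists in btrfs_fs_info to per-root lists, with fs_info keeping only a list of roots that currently have work pending. A filesystem-wide walk then becomes a two-level iteration, sketched here with the field names added above (the loop body is illustrative):

	static void visit_all_ordered_roots(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_root *root;

		spin_lock(&fs_info->ordered_root_lock);
		list_for_each_entry(root, &fs_info->ordered_roots, ordered_root) {
			/* per-root work, guarded by root->ordered_extent_lock */
		}
		spin_unlock(&fs_info->ordered_root_lock);
	}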
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 		num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	kfree(fs_info->balance_ctl);
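
btrfs_need_cleaner_sleep() packages the "read-only or unmounting" test that background work keeps re-checking. A rough sketch of how a cleaner-style kthread might poll it (the loop body and timing are illustrative, not the actual cleaner):

	static int cleaner_loop_sketch(void *arg)
	{
		struct btrfs_root *root = arg;

		while (!kthread_should_stop()) {
			if (!btrfs_need_cleaner_sleep(root)) {
				/* ... run deferred cleanup work here ... */
			}
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}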
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
 				   struct btrfs_root_item *item);
 void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 			  struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 			 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f26f38ccd194..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
535 return next; 535 return next;
536} 536}
537 537
538static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
539 u64 root_id)
540{
541 struct btrfs_key root_key;
542
543 if (root->objectid == root_id)
544 return root;
545
546 root_key.objectid = root_id;
547 root_key.type = BTRFS_ROOT_ITEM_KEY;
548 root_key.offset = (u64)-1;
549 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
550}
551
552static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 538static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
553 struct btrfs_root *root, 539 struct btrfs_root *root,
554 struct btrfs_delayed_item *item) 540 struct btrfs_delayed_item *item)
@@ -1681,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
1681 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree 1667 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1682 * 1668 *
1683 */ 1669 */
1684int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, 1670int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
1685 filldir_t filldir,
1686 struct list_head *ins_list) 1671 struct list_head *ins_list)
1687{ 1672{
1688 struct btrfs_dir_item *di; 1673 struct btrfs_dir_item *di;
@@ -1704,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1704 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1689 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1705 list_del(&curr->readdir_list); 1690 list_del(&curr->readdir_list);
1706 1691
1707 if (curr->key.offset < filp->f_pos) { 1692 if (curr->key.offset < ctx->pos) {
1708 if (atomic_dec_and_test(&curr->refs)) 1693 if (atomic_dec_and_test(&curr->refs))
1709 kfree(curr); 1694 kfree(curr);
1710 continue; 1695 continue;
1711 } 1696 }
1712 1697
1713 filp->f_pos = curr->key.offset; 1698 ctx->pos = curr->key.offset;
1714 1699
1715 di = (struct btrfs_dir_item *)curr->data; 1700 di = (struct btrfs_dir_item *)curr->data;
1716 name = (char *)(di + 1); 1701 name = (char *)(di + 1);
@@ -1719,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1719 d_type = btrfs_filetype_table[di->type]; 1704 d_type = btrfs_filetype_table[di->type];
1720 btrfs_disk_key_to_cpu(&location, &di->location); 1705 btrfs_disk_key_to_cpu(&location, &di->location);
1721 1706
1722 over = filldir(dirent, name, name_len, curr->key.offset, 1707 over = !dir_emit(ctx, name, name_len,
1723 location.objectid, d_type); 1708 location.objectid, d_type);
1724 1709
1725 if (atomic_dec_and_test(&curr->refs)) 1710 if (atomic_dec_and_test(&curr->refs))
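
The hunks above convert btrfs's delayed readdir path from the old filldir callback to the dir_context/dir_emit API the VFS introduced in 3.11: the position now lives in ctx->pos rather than file->f_pos, and dir_emit() returns false once the caller's buffer is full. A minimal sketch of an ->iterate() handler under the new API (the "examplefs" names are hypothetical, not part of this patch):

    #include <linux/fs.h>

    /* Sketch only: emit one fixed entry, tracking position in ctx->pos. */
    static int examplefs_iterate(struct file *file, struct dir_context *ctx)
    {
            if (!dir_emit_dots(file, ctx))  /* emits "." and "..", pos -> 2 */
                    return 0;

            if (ctx->pos == 2) {
                    if (!dir_emit(ctx, "hello", 5, 1001 /* ino */, DT_REG))
                            return 0;       /* buffer full, resume here later */
                    ctx->pos++;
            }
            return 0;
    }
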
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7abe3e..a4b38f934d14 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
139 struct list_head *del_list); 139 struct list_head *del_list);
140int btrfs_should_delete_dir_index(struct list_head *del_list, 140int btrfs_should_delete_dir_index(struct list_head *del_list,
141 u64 index); 141 u64 index);
142int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, 142int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
143 filldir_t filldir,
144 struct list_head *ins_list); 143 struct list_head *ins_list);
145 144
146/* for init */ 145/* for init */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
401 btrfs_dev_replace_unlock(dev_replace); 401 btrfs_dev_replace_unlock(dev_replace);
402 402
403 btrfs_wait_ordered_extents(root, 0); 403 btrfs_wait_all_ordered_extents(root->fs_info, 0);
404 404
405 /* force writing the updated state information to disk */ 405 /* force writing the updated state information to disk */
406 trans = btrfs_start_transaction(root, 0); 406 trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
470 * flush all outstanding I/O and inode extent mappings before the 470 * flush all outstanding I/O and inode extent mappings before the
471 * copy operation is declared as being finished 471 * copy operation is declared as being finished
472 */ 472 */
473 ret = btrfs_start_delalloc_inodes(root, 0); 473 ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
474 if (ret) { 474 if (ret) {
475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
476 return ret; 476 return ret;
477 } 477 }
478 btrfs_wait_ordered_extents(root, 0); 478 btrfs_wait_all_ordered_extents(root->fs_info, 0);
479 479
480 trans = btrfs_start_transaction(root, 0); 480 trans = btrfs_start_transaction(root, 0);
481 if (IS_ERR(trans)) { 481 if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
1013 return try_release_extent_buffer(page); 1013 return try_release_extent_buffer(page);
1014} 1014}
1015 1015
1016static void btree_invalidatepage(struct page *page, unsigned long offset) 1016static void btree_invalidatepage(struct page *page, unsigned int offset,
1017 unsigned int length)
1017{ 1018{
1018 struct extent_io_tree *tree; 1019 struct extent_io_tree *tree;
1019 tree = &BTRFS_I(page->mapping->host)->io_tree; 1020 tree = &BTRFS_I(page->mapping->host)->io_tree;
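
This follows the 3.11 change that gave ->invalidatepage a length argument, so a sub-page range, not just the tail from offset onward, can be invalidated. A sketch of a handler under the new prototype (hypothetical "examplefs", assuming the PAGE_CACHE_SIZE macro of that era):

    #include <linux/pagemap.h>

    static void examplefs_invalidatepage(struct page *page, unsigned int offset,
                                         unsigned int length)
    {
            /* Whole page going away: drop any private per-page state. */
            if (offset == 0 && length == PAGE_CACHE_SIZE)
                    ClearPagePrivate(page);
    }
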
@@ -1191,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1191 root->objectid = objectid; 1192 root->objectid = objectid;
1192 root->last_trans = 0; 1193 root->last_trans = 0;
1193 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1194 root->name = NULL; 1197 root->name = NULL;
1195 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1196 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1199,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1199 1202
1200 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1201 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1202 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1203 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1204 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1205 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1206 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1207 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1208 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1216,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1216 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1217 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1218 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1219 root->log_transid = 0; 1229 root->log_transid = 0;
1220 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1221 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1234,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1234 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1235} 1245}
1236 1246
1237static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1238 struct btrfs_fs_info *fs_info,
1239 u64 objectid,
1240 struct btrfs_root *root)
1241{
1242 int ret;
1243 u32 blocksize;
1244 u64 generation;
1245
1246 __setup_root(tree_root->nodesize, tree_root->leafsize,
1247 tree_root->sectorsize, tree_root->stripesize,
1248 root, fs_info, objectid);
1249 ret = btrfs_find_last_root(tree_root, objectid,
1250 &root->root_item, &root->root_key);
1251 if (ret > 0)
1252 return -ENOENT;
1253 else if (ret < 0)
1254 return ret;
1255
1256 generation = btrfs_root_generation(&root->root_item);
1257 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1258 root->commit_root = NULL;
1259 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1260 blocksize, generation);
1261 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1262 free_extent_buffer(root->node);
1263 root->node = NULL;
1264 return -EIO;
1265 }
1266 root->commit_root = btrfs_root_node(root);
1267 return 0;
1268}
1269
1270static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1271{ 1248{
1272 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1451,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1451 return 0; 1428 return 0;
1452} 1429}
1453 1430
1454struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1455 struct btrfs_key *location) 1432 struct btrfs_key *key)
1456{ 1433{
1457 struct btrfs_root *root; 1434 struct btrfs_root *root;
1458 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1459 struct btrfs_path *path; 1436 struct btrfs_path *path;
1460 struct extent_buffer *l;
1461 u64 generation; 1437 u64 generation;
1462 u32 blocksize; 1438 u32 blocksize;
1463 int ret = 0; 1439 int ret;
1464 int slot;
1465 1440
1466 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1467 if (!root) 1442 if (!path)
1468 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1469 if (location->offset == (u64)-1) { 1444
1470 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1471 location->objectid, root); 1446 if (!root) {
1472 if (ret) { 1447 ret = -ENOMEM;
1473 kfree(root); 1448 goto alloc_fail;
1474 return ERR_PTR(ret);
1475 }
1476 goto out;
1477 } 1449 }
1478 1450
1479 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1480 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1481 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1482 1454
1483 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1484 if (!path) { 1456 &root->root_item, &root->root_key);
1485 kfree(root);
1486 return ERR_PTR(-ENOMEM);
1487 }
1488 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1489 if (ret == 0) {
1490 l = path->nodes[0];
1491 slot = path->slots[0];
1492 btrfs_read_root_item(l, slot, &root->root_item);
1493 memcpy(&root->root_key, location, sizeof(*location));
1494 }
1495 btrfs_free_path(path);
1496 if (ret) { 1457 if (ret) {
1497 kfree(root);
1498 if (ret > 0) 1458 if (ret > 0)
1499 ret = -ENOENT; 1459 ret = -ENOENT;
1500 return ERR_PTR(ret); 1460 goto find_fail;
1501 } 1461 }
1502 1462
1503 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1504 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1505 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1506 blocksize, generation); 1466 blocksize, generation);
1507 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1508 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1509 1469 goto find_fail;
1510 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1511 kfree(root); 1471 ret = -EIO;
1512 return ERR_PTR(ret); 1472 goto read_fail;
1513 } 1473 }
1514
1515 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1516out: 1475out:
1517 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1518 root->ref_cows = 1; 1498 root->ref_cows = 1;
1519 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1520 } 1500 }
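
The rewrite above replaces scattered kfree()/return pairs with staged unwind labels (read_fail/find_fail/alloc_fail), so each failure path releases exactly what was acquired before it. A generic sketch of the idiom, with all names hypothetical:

    #include <linux/slab.h>
    #include <linux/err.h>

    struct thing {
            void *buf;
    };

    static int thing_read(struct thing *t);    /* hypothetical helper */

    static struct thing *thing_open(void)
    {
            struct thing *t;
            int ret;

            t = kzalloc(sizeof(*t), GFP_NOFS);
            if (!t)
                    return ERR_PTR(-ENOMEM);

            t->buf = kmalloc(4096, GFP_NOFS);
            if (!t->buf) {
                    ret = -ENOMEM;
                    goto free_thing;
            }

            ret = thing_read(t);
            if (ret)
                    goto free_buf;

            return t;

    free_buf:                       /* later labels undo earlier steps */
            kfree(t->buf);
    free_thing:
            kfree(t);
            return ERR_PTR(ret);
    }
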
@@ -1522,6 +1502,66 @@ out:
1522 return root; 1502 return root;
1523} 1503}
1524 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1525struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1526 struct btrfs_key *location) 1566 struct btrfs_key *location)
1527{ 1567{
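
btrfs_insert_fs_root() above shows the standard radix-tree insertion dance: preload outside the spinlock (the allocation may sleep), insert under the lock, then end the preload. A condensed sketch with hypothetical names:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    static RADIX_TREE(example_tree, GFP_NOFS);
    static DEFINE_SPINLOCK(example_lock);

    static int example_insert(unsigned long id, void *item)
    {
            int ret;

            /* May sleep, so it runs before the spinlock is taken. */
            ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
            if (ret)
                    return ret;

            spin_lock(&example_lock);
            /* Cannot fail with -ENOMEM now, only with -EEXIST. */
            ret = radix_tree_insert(&example_tree, id, item);
            spin_unlock(&example_lock);
            radix_tree_preload_end();

            return ret;
    }
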
@@ -1542,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1542 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1543 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1544again: 1584again:
1545 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1546 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1547 (unsigned long)location->objectid);
1548 spin_unlock(&fs_info->fs_roots_radix_lock);
1549 if (root) 1586 if (root)
1550 return root; 1587 return root;
1551 1588
1552 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1553 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1554 return root; 1591 return root;
1555 1592
1556 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1557 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1558 GFP_NOFS);
1559 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1560 ret = -ENOMEM;
1561 goto fail; 1595 goto fail;
1562 } 1596 }
1563 1597
1564 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1565 mutex_init(&root->fs_commit_mutex);
1566 spin_lock_init(&root->cache_lock);
1567 init_waitqueue_head(&root->cache_wait);
1568
1569 ret = get_anon_bdev(&root->anon_dev);
1570 if (ret) 1599 if (ret)
1571 goto fail; 1600 goto fail;
1572 1601
1573 if (btrfs_root_refs(&root->root_item) == 0) {
1574 ret = -ENOENT;
1575 goto fail;
1576 }
1577
1578 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1579 if (ret < 0) 1603 if (ret < 0)
1580 goto fail; 1604 goto fail;
1581 if (ret == 0) 1605 if (ret == 0)
1582 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1583 1607
1584 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1585 if (ret)
1586 goto fail;
1587
1588 spin_lock(&fs_info->fs_roots_radix_lock);
1589 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1590 (unsigned long)root->root_key.objectid,
1591 root);
1592 if (ret == 0)
1593 root->in_radix = 1;
1594
1595 spin_unlock(&fs_info->fs_roots_radix_lock);
1596 radix_tree_preload_end();
1597 if (ret) { 1609 if (ret) {
1598 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1599 free_fs_root(root); 1611 free_fs_root(root);
@@ -1601,10 +1613,6 @@ again:
1601 } 1613 }
1602 goto fail; 1614 goto fail;
1603 } 1615 }
1604
1605 ret = btrfs_find_dead_roots(fs_info->tree_root,
1606 root->root_key.objectid);
1607 WARN_ON(ret);
1608 return root; 1616 return root;
1609fail: 1617fail:
1610 free_fs_root(root); 1618 free_fs_root(root);
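
The rewritten lookup also shows the optimistic lookup-or-create race handling: a task that loses the insertion race sees -EEXIST, frees its copy, and retries via "goto again" to pick up the winner's root. A sketch of the flow (example_lookup/example_create/example_free are hypothetical stand-ins for btrfs_lookup_fs_root, btrfs_read_fs_root and free_fs_root; example_insert is the sketch above):

    static struct thing *example_get(unsigned long id)
    {
            struct thing *obj;
            int ret;

    again:
            obj = example_lookup(id);
            if (obj)
                    return obj;

            obj = example_create(id);
            if (IS_ERR(obj))
                    return obj;

            ret = example_insert(id, obj);
            if (ret == -EEXIST) {
                    /* Lost the race: drop our copy, find the winner's. */
                    example_free(obj);
                    goto again;
            }
            if (ret)
                    return ERR_PTR(ret);
            return obj;
    }
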
@@ -1676,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1676static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1677{ 1685{
1678 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1679 1688
1680 do { 1689 do {
1681 int again = 0; 1690 again = 0;
1682 1691
1683 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1684 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1685 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1686 btrfs_run_delayed_iputs(root); 1695
1687 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1688 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1689 } 1698
1690 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
 1691 up_read(&root->fs_info->sb->s_umount); 1700 * The fs status may have changed during the above check
 1701 * and trylock, so recheck it before doing any work.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1692 } 1706 }
1693 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
 1714 * so we needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1694 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1695 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1696 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
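
The restructured cleaner keeps the canonical kthread idle loop at the bottom: set TASK_INTERRUPTIBLE before the final kthread_should_stop() check, so a concurrent kthread_stop() wakeup cannot fall between the check and the schedule(). A sketch of just that pattern (hypothetical thread, not the btrfs one):

    #include <linux/kthread.h>
    #include <linux/freezer.h>

    static int example_kthread(void *arg)
    {
            int again;

            do {
                    again = 0;
                    /* ... one round of work; set 'again' if more remains ... */

                    if (!try_to_freeze() && !again) {
                            set_current_state(TASK_INTERRUPTIBLE);
                            if (!kthread_should_stop())
                                    schedule();
                            __set_current_state(TASK_RUNNING);
                    }
            } while (!kthread_should_stop());
            return 0;
    }
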
@@ -1724,7 +1748,7 @@ static int transaction_kthread(void *arg)
1724 } 1748 }
1725 1749
1726 now = get_seconds(); 1750 now = get_seconds();
1727 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1728 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1729 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1730 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2034,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2034 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2035 2059
2036 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2037 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2038 } else { 2062 } else {
2039 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2040 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2041 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2042 } 2066 }
2043 } 2067 }
2044 2068
@@ -2049,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2049 if (!ret) 2073 if (!ret)
2050 break; 2074 break;
2051 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2052 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2053 } 2077 }
2054} 2078}
2055 2079
@@ -2081,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2081 int backup_index = 0; 2105 int backup_index = 0;
2082 2106
2083 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2084 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2085 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2086 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2087 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2088 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2089
2090 if (!tree_root || !extent_root || !csum_root ||
2091 !chunk_root || !dev_root || !quota_root) {
2092 err = -ENOMEM; 2110 err = -ENOMEM;
2093 goto fail; 2111 goto fail;
2094 } 2112 }
@@ -2131,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2131 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2132 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2133 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2134 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2135 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2136 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2137 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2138 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2139 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2169,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2169 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2170 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2171 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2172 fs_info->trans_no_join = 0;
2173 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2174 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2175 2192
@@ -2180,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2180 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2181 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2182 2199
2183 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2184 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2185 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2186 GFP_NOFS); 2203 GFP_NOFS);
2187 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2274,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2274 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2275 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2276 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2277 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2278 2296
2279 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2638,33 +2656,44 @@ retry_root_backup:
2638 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2639 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2640 2658
2641 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2642 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2643 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2644 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2645 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2646 2670
2647 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2648 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2649 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2650 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2651 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2652 2680
2653 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2654 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2655 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2656 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2657 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2658 2689
2659 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2660 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2661 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2662 kfree(quota_root);
2663 quota_root = fs_info->quota_root = NULL;
2664 } else {
2665 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2666 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2667 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2668 } 2697 }
2669 2698
2670 fs_info->generation = generation; 2699 fs_info->generation = generation;
@@ -2817,11 +2846,9 @@ retry_root_backup:
2817 2846
2818 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2819 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2820 location.offset = (u64)-1; 2849 location.offset = 0;
2821 2850
2822 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2823 if (!fs_info->fs_root)
2824 goto fail_qgroup;
2825 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2826 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2827 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2853,6 +2880,8 @@ retry_root_backup:
2853 return ret; 2880 return ret;
2854 } 2881 }
2855 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2856 return 0; 2885 return 0;
2857 2886
2858fail_qgroup: 2887fail_qgroup:
@@ -3258,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3258 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3259 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3260 } else if (flags & 3289 } else if (flags &
3261 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3262 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3263 } 3292 }
3264 } 3293 }
@@ -3366,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3366 return ret; 3395 return ret;
3367} 3396}
3368 3397
3369void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3370{ 3401{
3371 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3372 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3397,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3397 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3398 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3399 kfree(root->name); 3430 kfree(root->name);
3400 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3401} 3437}
3402 3438
3403int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3653,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3653 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3654 3690
3655 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3656 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3657 3693
3658 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3659 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3661,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3661 ordered_operations); 3697 ordered_operations);
3662 3698
3663 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3664 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3665 3701
3666 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3667 3703
3668 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3669 } 3705 }
3670 3706
3671 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3672 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3673} 3709}
3674 3710
@@ -3676,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3676{ 3712{
3677 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3678 3714
3679 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3680 /* 3716 /*
3681 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3682 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3683 */ 3719 */
3684 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3685 root_extent_list) 3721 root_extent_list)
3686 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3687 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3688} 3745}
3689 3746
3690int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
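
Both new *_all_* teardown helpers use the same splice-and-drain idiom: the shared list is emptied in one step under the lock, so the walker owns every spliced entry and can drop the lock around the per-entry work (btrfs_destroy_all_ordered_extents() instead stays under the lock and relies on cond_resched_lock()). A generic sketch, with 'process' a hypothetical per-item callback:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct example_item {
            struct list_head list;
    };

    static void example_drain(struct list_head *shared, spinlock_t *lock,
                              void (*process)(struct example_item *))
    {
            struct example_item *it;
            LIST_HEAD(splice);

            spin_lock(lock);
            list_splice_init(shared, &splice);      /* take the whole list */
            while (!list_empty(&splice)) {
                    it = list_first_entry(&splice, struct example_item, list);
                    list_del_init(&it->list);
                    spin_unlock(lock);

                    process(it);            /* heavy work, lock dropped */

                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
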
@@ -3706,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3706 3763
3707 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3708 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3709 3767
3710 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3711 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3726,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3726 } 3784 }
3727 3785
3728 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3729 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3730 ref->num_bytes, 1);
3731 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3732 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3733 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3738,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3738 ref->in_tree = 0; 3795 ref->in_tree = 0;
3739 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3740 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3741 if (head)
3742 mutex_unlock(&head->mutex);
3743 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3744 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3745 3806
3746 cond_resched(); 3807 cond_resched();
@@ -3777,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3777 3838
3778 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3779 3840
3780 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3781 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3782 3843
3783 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3784 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3785 delalloc_inodes); 3846 delalloc_inodes);
3786 3847
3787 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3788 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3789 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3790 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3791 3852
3792 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3793 3854
3794 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3795 } 3856 }
3796 3857
3797 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3798} 3884}
3799 3885
3800static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3878,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3878 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3879 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3880 3966
3881 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3882 cur_trans->in_commit = 1;
3883 cur_trans->blocked = 1;
3884 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3885 3969
3886 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3887 3971
3888 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3889 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3890 3974
3891 cur_trans->commit_done = 1;
3892 wake_up(&cur_trans->commit_wait);
3893
3894 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3895 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3896 3977
@@ -3899,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3899 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3900 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3901 3982
 3983 cur_trans->state = TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3902 /* 3986 /*
3903 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3904 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3914,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3914 3998
3915 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3916 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3917 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3918 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3919 4003
3920 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3922,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3922 4006
3923 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3924 4008
3925 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3926 4010
3927 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3928 4012
3929 /* FIXME: cleanup wait for commit */ 4013 /*
3930 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
 3931 t->blocked = 1; 4015 * We needn't acquire the lock here: we are in the middle of
 4016 * umount, so no other task will change it.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3932 smp_mb(); 4019 smp_mb();
3933 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3934 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3935 4022
3936 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3937 4024
3938 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3939 smp_mb(); 4026 smp_mb();
3940 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3941 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3942 4029
3943 t->commit_done = 1;
3944 smp_mb();
3945 if (waitqueue_active(&t->commit_wait))
3946 wake_up(&t->commit_wait);
3947
3948 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3949 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3950 4032
3951 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3952
3953 spin_lock(&root->fs_info->trans_lock);
3954 root->fs_info->running_transaction = NULL;
3955 spin_unlock(&root->fs_info->trans_lock);
3956 4034
3957 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3958 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3960,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3960 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3961 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3962 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3963 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3964 list_del_init(&t->list); 4047 list_del_init(&t->list);
3965 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3966 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3967 } 4050 }
3968 4051
3969 spin_lock(&root->fs_info->trans_lock);
3970 root->fs_info->trans_no_join = 0;
3971 spin_unlock(&root->fs_info->trans_lock);
3972 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3973 4053
3974 return 0; 4054 return 0;
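
The cleanup path now walks the explicit transaction state machine (TRANS_STATE_COMMIT_START -> TRANS_STATE_UNBLOCKED -> TRANS_STATE_COMPLETED) instead of the removed in_commit/blocked/commit_done flags. Each transition repeats the same publish-then-wake step, sketched below with hypothetical names; the barrier orders the state store before the waitqueue_active() read, pairing with a waiter that queues itself and then rechecks the state:

    #include <linux/wait.h>

    enum example_state { EX_RUNNING, EX_UNBLOCKED, EX_COMPLETED };

    struct example_trans {
            enum example_state state;
            wait_queue_head_t commit_wait;
    };

    static void example_set_state(struct example_trans *t, enum example_state s)
    {
            t->state = s;
            smp_mb();       /* publish the state before peeking at waiters */
            if (waitqueue_active(&t->commit_wait))
                    wake_up(&t->commit_wait);
    }
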
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root, and avoid it is freed when we
82 * access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
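
btrfs_grab_fs_root()/btrfs_put_fs_root() above implement the usual "take a reference only if the object is still live" pattern: atomic_inc_not_zero() fails once the count has dropped to zero, so a dying root cannot be resurrected. A usage sketch (the helper is hypothetical; 'candidate' would come from a lookup that takes no reference of its own, such as a radix-tree or list walk under a spinlock):

    static void example_use_root(struct btrfs_root *candidate)
    {
            struct btrfs_root *root;

            root = btrfs_grab_fs_root(candidate);   /* NULL once refs hit 0 */
            if (!root)
                    return;                         /* already dying */

            /* ... root stays valid here, even after the lookup lock is gone ... */

            btrfs_put_fs_root(root);                /* kfree()s on final put */
    }
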
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
 2541 * closer to what we're really going to want to use.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
 2562 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
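
total_bytes_pinned becomes a percpu_counter so hot-path additions stay per-CPU, while the occasional threshold check uses percpu_counter_compare(), which only does the expensive precise sum when the batched central value is too close to the bound to decide. A minimal lifecycle sketch, assuming the 3.x API where percpu_counter_init() takes no GFP argument:

    #include <linux/percpu_counter.h>

    static struct percpu_counter pinned;

    static int example(void)
    {
            int ret;

            ret = percpu_counter_init(&pinned, 0);
            if (ret)
                    return ret;

            percpu_counter_add(&pinned, 1 << 20);   /* cheap per-CPU batch */

            /* Returns <0, 0 or >0, like memcmp(). */
            if (percpu_counter_compare(&pinned, 4096) >= 0) {
                    /* at least 4 KiB accounted as pinned */
            }

            percpu_counter_destroy(&pinned);
            return 0;
    }
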
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
 3569 * don't bother committing the transaction, it won't help us. 3624 * allocation, don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
 3896 * We needn't worry about the filesystem going from r/w to r/o though 3953 * We needn't worry about the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 3899 * the filesystem is readonly (all dirty pages are written to 3956 * the filesystem is readonly (all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
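
btrfs_cond_migrate_bytes() only steals from the global reserve when at least min_factor tenths of its size would stay backed; div_factor() is assumed here to compute num * factor / 10, as btrfs's helper does. A worked example of the threshold with hypothetical numbers:

    static int example_can_migrate(u64 size, u64 reserved)
    {
            u64 min_bytes = size * 5 / 10;  /* min_factor = 5: keep half */
            u64 num_bytes = 2 << 20;        /* caller wants 2 MiB        */

            /* With size = 10 MiB: allowed only while reserved >= 7 MiB. */
            return reserved >= min_bytes + num_bytes;
    }
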
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
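
The interesting part of __exclude_logged_extent() is the three-way split against caching_ctl->progress: the caching thread's cursor divides the block group into a region already loaded into the free-space cache, from which the logged extent must be removed, and a region not yet scanned, which must be marked excluded so the cache never adds it. A compilable sketch of the intended split; remove_free_space() and exclude() are invented stand-ins for btrfs_remove_free_space() and add_excluded_extent():

    #include <stdint.h>
    #include <stdio.h>

    static void remove_free_space(uint64_t s, uint64_t l)
    { printf("remove  [%llu,+%llu)\n", (unsigned long long)s, (unsigned long long)l); }
    static void exclude(uint64_t s, uint64_t l)
    { printf("exclude [%llu,+%llu)\n", (unsigned long long)s, (unsigned long long)l); }

    /* A logged extent may sit wholly below the cursor, wholly above it,
     * or straddle it; only the straddling case needs both actions. */
    static void split_against_progress(uint64_t start, uint64_t len,
                                       uint64_t progress)
    {
        if (start >= progress)
            exclude(start, len);             /* wholly unscanned */
        else if (start + len <= progress)
            remove_free_space(start, len);   /* wholly cached */
        else {
            remove_free_space(start, progress - start);
            exclude(progress, start + len - progress);
        }
    }

    int main(void) { split_against_progress(4096, 8192, 8192); return 0; }
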
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
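
Note that add_pinned_bytes() takes the delta as a u64, and the hunk below passes -num_bytes through it when references are actually dropped. That works because the negation wraps modulo 2^64 and percpu_counter_add() takes an s64, so the same bit pattern reads back as a negative delta. A two-line demonstration; the conversion is well-defined on the two's-complement targets the kernel runs on, though implementation-defined in strict pre-C23 ISO C:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t num_bytes = 4096;
        uint64_t delta = -num_bytes;            /* wraps to 2^64 - 4096 */
        int64_t signed_delta = (int64_t)delta;  /* same bits, signed view */

        assert(signed_delta == -4096);
        return 0;
    }
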
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578
6579 if (start >= caching_ctl->progress) {
6580 ret = add_excluded_extent(root, start, num_bytes);
6581 } else if (start + num_bytes <= caching_ctl->progress) {
6582 ret = btrfs_remove_free_space(block_group,
6583 start, num_bytes);
6584 } else {
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590 6757
6591 start = caching_ctl->progress; 6758 /*
6592 num_bytes = ins->objectid + ins->offset - 6759 * Mixed block groups will exclude before processing the log so we only
6593 caching_ctl->progress; 6760 * need to do the exlude dance if this fs isn't mixed.
6594 ret = add_excluded_extent(root, start, num_bytes); 6761 */
6595 } 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6596out_lock: 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7298,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7298 int err = 0; 7466 int err = 0;
7299 int ret; 7467 int ret;
7300 int level; 7468 int level;
7469 bool root_dropped = false;
7301 7470
7302 path = btrfs_alloc_path(); 7471 path = btrfs_alloc_path();
7303 if (!path) { 7472 if (!path) {
@@ -7355,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7355 while (1) { 7524 while (1) {
7356 btrfs_tree_lock(path->nodes[level]); 7525 btrfs_tree_lock(path->nodes[level]);
7357 btrfs_set_lock_blocking(path->nodes[level]); 7526 btrfs_set_lock_blocking(path->nodes[level]);
7527 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7358 7528
7359 ret = btrfs_lookup_extent_info(trans, root, 7529 ret = btrfs_lookup_extent_info(trans, root,
7360 path->nodes[level]->start, 7530 path->nodes[level]->start,
@@ -7370,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7370 break; 7540 break;
7371 7541
7372 btrfs_tree_unlock(path->nodes[level]); 7542 btrfs_tree_unlock(path->nodes[level]);
7543 path->locks[level] = 0;
7373 WARN_ON(wc->refs[level] != 1); 7544 WARN_ON(wc->refs[level] != 1);
7374 level--; 7545 level--;
7375 } 7546 }
@@ -7384,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7555 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7556
7386 while (1) { 7557 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
7388 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN;
7390 goto out_end_trans;
7391 }
7392 7558
7393 ret = walk_down_tree(trans, root, path, wc); 7559 ret = walk_down_tree(trans, root, path, wc);
7394 if (ret < 0) { 7560 if (ret < 0) {
@@ -7416,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7416 } 7582 }
7417 7583
7418 BUG_ON(wc->level == 0); 7584 BUG_ON(wc->level == 0);
7419 if (btrfs_should_end_transaction(trans, tree_root)) { 7585 if (btrfs_should_end_transaction(trans, tree_root) ||
7586 (!for_reloc && btrfs_need_cleaner_sleep(root))) {
7420 ret = btrfs_update_root(trans, tree_root, 7587 ret = btrfs_update_root(trans, tree_root,
7421 &root->root_key, 7588 &root->root_key,
7422 root_item); 7589 root_item);
@@ -7427,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7427 } 7594 }
7428 7595
7429 btrfs_end_transaction_throttle(trans, tree_root); 7596 btrfs_end_transaction_throttle(trans, tree_root);
7597 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7598 pr_debug("btrfs: drop snapshot early exit\n");
7599 err = -EAGAIN;
7600 goto out_free;
7601 }
7602
7430 trans = btrfs_start_transaction(tree_root, 0); 7603 trans = btrfs_start_transaction(tree_root, 0);
7431 if (IS_ERR(trans)) { 7604 if (IS_ERR(trans)) {
7432 err = PTR_ERR(trans); 7605 err = PTR_ERR(trans);
@@ -7447,8 +7620,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7620 }
7448 7621
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7622 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7623 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7624 NULL, NULL);
7452 if (ret < 0) { 7625 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7626 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7627 err = ret;
@@ -7465,18 +7638,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7638 }
7466 7639
7467 if (root->in_radix) { 7640 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7641 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7642 } else {
7470 free_extent_buffer(root->node); 7643 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7644 free_extent_buffer(root->commit_root);
7472 kfree(root); 7645 btrfs_put_fs_root(root);
7473 } 7646 }
7647 root_dropped = true;
7474out_end_trans: 7648out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7649 btrfs_end_transaction_throttle(trans, tree_root);
7476out_free: 7650out_free:
7477 kfree(wc); 7651 kfree(wc);
7478 btrfs_free_path(path); 7652 btrfs_free_path(path);
7479out: 7653out:
7654 /*
7655 * So if we need to stop dropping the snapshot for whatever reason we
7656 * need to make sure to add it back to the dead root list so that we
7657 * keep trying to do the work later. This also cleans up roots if we
7658 * don't have it in the radix (like when we recover after a power fail
7659 * or unmount) so we don't leak memory.
7660 */
7661 if (root_dropped == false)
7662 btrfs_add_dead_root(root);
7480 if (err) 7663 if (err)
7481 btrfs_std_error(root->fs_info, err); 7664 btrfs_std_error(root->fs_info, err);
7482 return err; 7665 return err;
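
The root_dropped flag is a common error-path idiom: mark success only once nothing is left to undo, and let a single exit label requeue the work otherwise, which is exactly what the comment above is describing. A generic sketch of the shape; queue_for_retry() is a hypothetical stand-in for btrfs_add_dead_root():

    #include <stdbool.h>
    #include <stdio.h>

    struct work { int id; };

    static void queue_for_retry(struct work *w) { printf("requeue %d\n", w->id); }
    static int step1(struct work *w) { (void)w; return 0; }
    static int step2(struct work *w) { (void)w; return -1; }  /* simulate failure */

    static int do_drop(struct work *w)
    {
        bool done = false;
        int err;

        err = step1(w);
        if (err)
            goto out;
        err = step2(w);
        if (err)
            goto out;
        done = true;             /* set only after the work fully completed */
    out:
        if (!done)
            queue_for_retry(w);  /* retry later instead of leaking w */
        return err;
    }

    int main(void) { struct work w = { 1 }; do_drop(&w); return 0; }
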
@@ -7782,6 +7965,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7965 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7966 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7967 struct btrfs_device *device;
7968 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7969 u64 min_free;
7786 u64 dev_min = 1; 7970 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7971 u64 dev_nr = 0;
@@ -7868,6 +8052,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8052 do_div(min_free, dev_min);
7869 } 8053 }
7870 8054
8055 /* We need to do this so that we can look at pending chunks */
8056 trans = btrfs_join_transaction(root);
8057 if (IS_ERR(trans)) {
8058 ret = PTR_ERR(trans);
8059 goto out;
8060 }
8061
7871 mutex_lock(&root->fs_info->chunk_mutex); 8062 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8063 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8064 u64 dev_offset;
@@ -7878,7 +8069,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8069 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8070 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8071 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8072 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8073 &dev_offset, NULL);
7883 if (!ret) 8074 if (!ret)
7884 dev_nr++; 8075 dev_nr++;
@@ -7890,6 +8081,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8081 }
7891 } 8082 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8083 mutex_unlock(&root->fs_info->chunk_mutex);
8084 btrfs_end_transaction(trans, root);
7893out: 8085out:
7894 btrfs_put_block_group(block_group); 8086 btrfs_put_block_group(block_group);
7895 return ret; 8087 return ret;
@@ -8032,6 +8224,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8224 dump_space_info(space_info, 0, 0);
8033 } 8225 }
8034 } 8226 }
8227 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8228 list_del(&space_info->list);
8036 kfree(space_info); 8229 kfree(space_info);
8037 } 8230 }
@@ -8254,6 +8447,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8447 sizeof(item));
8255 if (ret) 8448 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8449 btrfs_abort_transaction(trans, extent_root, ret);
8450 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8451 key.objectid, key.offset);
8452 if (ret)
8453 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8454 }
8258} 8455}
8259 8456
@@ -8591,8 +8788,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8788 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8789 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8790 ret = cache_block_group(cache, 0);
8594 if (!ret) 8791 if (ret) {
8595 wait_block_group_cache_done(cache); 8792 btrfs_put_block_group(cache);
8793 break;
8794 }
8795 ret = wait_block_group_cache_done(cache);
8796 if (ret) {
8797 btrfs_put_block_group(cache);
8798 break;
8799 }
8596 } 8800 }
8597 ret = btrfs_trim_block_group(cache, 8801 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8802 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..fe443fece851 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
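
The btrfs_debug_check_extent_io_range() wrapper uses a small but handy pattern: because the macro body expands inside the caller, __func__ evaluates to the calling function's name, so the helper can report who handed it the odd range without every call site passing a string. The same trick in isolation:

    #include <stdio.h>

    #define check_range(s, e) __check_range(__func__, (s), (e))

    static void __check_range(const char *caller, unsigned long s,
                              unsigned long e)
    {
        if (s > e)
            fprintf(stderr, "%s: bad range [%lu,%lu]\n", caller, s, e);
    }

    static void writer(void)
    {
        check_range(8, 4);   /* prints "writer: bad range [8,4]" */
    }

    int main(void) { writer(); return 0; }
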
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
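
The new zeroing in end_bio_extent_readpage() handles a page that straddles i_size: the bytes past EOF are not backed by file data and must not expose stale page contents once the page is marked uptodate. Worked numbers, assuming 4 KiB pages (PAGE_CACHE_SIZE equals PAGE_SIZE here); division and modulo below mirror the shift and mask in the hunk:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
        /* With i_size = 10000, the last valid page index is 2 and bytes
         * 1808..4095 of that page lie past EOF, so the read completion
         * zeroes them before SetPageUptodate(). */
        uint64_t i_size = 10000;
        uint64_t end_index = i_size / PAGE_SIZE;  /* i_size >> PAGE_CACHE_SHIFT */
        unsigned offset = i_size % PAGE_SIZE;     /* i_size & (PAGE_SIZE - 1) */

        assert(end_index == 2 && offset == 1808);
        /* zero_user_segment(page, offset, PAGE_SIZE) clears [1808, 4096) */
        return 0;
    }
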
@@ -2957,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2957 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2996 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2958 if (page->index > end_index || 2997 if (page->index > end_index ||
2959 (page->index == end_index && !pg_offset)) { 2998 (page->index == end_index && !pg_offset)) {
2960 page->mapping->a_ops->invalidatepage(page, 0); 2999 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
2961 unlock_page(page); 3000 unlock_page(page);
2962 return 0; 3001 return 0;
2963 } 3002 }
@@ -4009,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4009 } 4048 }
4010 4049
4011 while (!end) { 4050 while (!end) {
4012 u64 offset_in_extent; 4051 u64 offset_in_extent = 0;
4013 4052
4014 /* break if the extent we found is outside the range */ 4053 /* break if the extent we found is outside the range */
4015 if (em->start >= max || extent_map_end(em) < off) 4054 if (em->start >= max || extent_map_end(em) < off)
@@ -4025,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4025 4064
4026 /* 4065 /*
4027 * record the offset from the start of the extent 4066 * record the offset from the start of the extent
4028 * for adjusting the disk offset below 4067 * for adjusting the disk offset below. Only do this if the
4068 * extent isn't compressed since our in ram offset may be past
4069 * what we have actually allocated on disk.
4029 */ 4070 */
4030 offset_in_extent = em_start - em->start; 4071 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4072 offset_in_extent = em_start - em->start;
4031 em_end = extent_map_end(em); 4073 em_end = extent_map_end(em);
4032 em_len = em_end - em_start; 4074 em_len = em_end - em_start;
4033 emflags = em->flags; 4075 emflags = em->flags;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
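
With btrfs_sector_sum gone, an ordered sum carries a flat u32 array, one crc32c per sector, and MAX_ORDERED_SUM_BYTES is simply how many file bytes one page-sized allocation can describe. A back-of-the-envelope check, assuming a 32-byte struct header and 4 KiB sectors (both assumptions; the real header size depends on the struct layout):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE  4096u
    #define SUM_HEADER 32u   /* assumed size of struct btrfs_ordered_sum */

    int main(void)
    {
        uint32_t sectorsize = 4096;
        /* checksums that fit in one page, times bytes each one covers */
        uint64_t max_bytes = (PAGE_SIZE - SUM_HEADER) / sizeof(uint32_t)
                             * sectorsize;
        /* 1016 * 4096 = 4,161,536 bytes, roughly 4 MiB per ordered sum */
        printf("%llu\n", (unsigned long long)max_bytes);
        return 0;
    }
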
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
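
The rewritten loop indexes straight into sums->sums[] instead of walking per-sector (bytenr, sum) pairs; the disk byte number is now implied by sums->bytenr plus the running offset. A self-contained sketch of the per-sector checksumming, with a plain bitwise CRC-32C standing in for btrfs_csum_data()/btrfs_csum_final(); as far as the hunk shows, btrfs seeds with ~0 and stores the inverted CRC, which together give the standard CRC-32C:

    #include <stddef.h>
    #include <stdint.h>

    /* Bitwise CRC-32C (Castagnoli, reflected polynomial 0x82F63B78). */
    static uint32_t crc32c_raw(uint32_t crc, const void *buf, size_t len)
    {
        const uint8_t *p = buf;

        while (len--) {
            crc ^= *p++;
            for (int k = 0; k < 8; k++)
                crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
        }
        return crc;
    }

    /* One u32 checksum per sector, appended in bio order; len is assumed
     * to be a multiple of sectorsize, as the bio_vec lengths are here. */
    static void csum_chunks(const uint8_t *data, size_t len,
                            size_t sectorsize, uint32_t *sums)
    {
        size_t index = 0;

        for (size_t off = 0; off < len; off += sectorsize)
            sums[index++] = ~crc32c_raw(~0u, data + off, sectorsize);
    }
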
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
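
The insert path now writes a whole run of checksums with one write_extent_buffer() call: ins_size is the byte count of the remaining checksums, clamped to the room left in the leaf item, then converted back to a sector count to advance total_bytes. Worked numbers for 4 KiB blocks and 4-byte crc32c sums:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t csum_size = 4;
        uint64_t remaining = 64 * 1024;                   /* sums->len - total_bytes */
        uint32_t ins_size = (uint32_t)(remaining >> 12);  /* 16 checksums */
        ins_size *= csum_size;                            /* 64 bytes to copy */

        uint32_t item_room = 40;    /* item_end - item for this leaf, say */
        if (ins_size > item_room)
            ins_size = item_room;   /* clamp to the space in the item */

        ins_size /= csum_size;      /* back to a sector count */
        assert(ins_size == 10);     /* 10 sectors = 40 KiB logged; loop again
                                       for the rest */
        return 0;
    }
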
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4205ba752d40..8e686a427ce2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -600,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
600 if (no_splits) 596 if (no_splits)
601 goto next; 597 goto next;
602 598
603 if (em->block_start < EXTENT_MAP_LAST_BYTE && 599 if (em->start < start) {
604 em->start < start) {
605 split->start = em->start; 600 split->start = em->start;
606 split->len = start - em->start; 601 split->len = start - em->start;
607 split->orig_start = em->orig_start;
608 split->block_start = em->block_start;
609 602
610 if (compressed) 603 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
611 split->block_len = em->block_len; 604 split->orig_start = em->orig_start;
612 else 605 split->block_start = em->block_start;
613 split->block_len = split->len; 606
614 split->ram_bytes = em->ram_bytes; 607 if (compressed)
615 split->orig_block_len = max(split->block_len, 608 split->block_len = em->block_len;
616 em->orig_block_len); 609 else
610 split->block_len = split->len;
611 split->orig_block_len = max(split->block_len,
612 em->orig_block_len);
613 split->ram_bytes = em->ram_bytes;
614 } else {
615 split->orig_start = split->start;
616 split->block_len = 0;
617 split->block_start = em->block_start;
618 split->orig_block_len = 0;
619 split->ram_bytes = split->len;
620 }
621
617 split->generation = gen; 622 split->generation = gen;
618 split->bdev = em->bdev; 623 split->bdev = em->bdev;
619 split->flags = flags; 624 split->flags = flags;
@@ -624,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
624 split = split2; 629 split = split2;
625 split2 = NULL; 630 split2 = NULL;
626 } 631 }
627 if (em->block_start < EXTENT_MAP_LAST_BYTE && 632 if (testend && em->start + em->len > start + len) {
628 testend && em->start + em->len > start + len) {
629 u64 diff = start + len - em->start; 633 u64 diff = start + len - em->start;
630 634
631 split->start = start + len; 635 split->start = start + len;
@@ -634,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
634 split->flags = flags; 638 split->flags = flags;
635 split->compress_type = em->compress_type; 639 split->compress_type = em->compress_type;
636 split->generation = gen; 640 split->generation = gen;
637 split->orig_block_len = max(em->block_len, 641
642 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
643 split->orig_block_len = max(em->block_len,
638 em->orig_block_len); 644 em->orig_block_len);
639 split->ram_bytes = em->ram_bytes;
640 645
641 if (compressed) { 646 split->ram_bytes = em->ram_bytes;
642 split->block_len = em->block_len; 647 if (compressed) {
643 split->block_start = em->block_start; 648 split->block_len = em->block_len;
644 split->orig_start = em->orig_start; 649 split->block_start = em->block_start;
650 split->orig_start = em->orig_start;
651 } else {
652 split->block_len = split->len;
653 split->block_start = em->block_start
654 + diff;
655 split->orig_start = em->orig_start;
656 }
645 } else { 657 } else {
646 split->block_len = split->len; 658 split->ram_bytes = split->len;
647 split->block_start = em->block_start + diff; 659 split->orig_start = split->start;
648 split->orig_start = em->orig_start; 660 split->block_len = 0;
661 split->block_start = em->block_start;
662 split->orig_block_len = 0;
649 } 663 }
650 664
651 ret = add_extent_mapping(em_tree, split, modified); 665 ret = add_extent_mapping(em_tree, split, modified);
@@ -1317,6 +1331,56 @@ fail:
1317 1331
1318} 1332}
1319 1333
1334static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1335 size_t *write_bytes)
1336{
1337 struct btrfs_trans_handle *trans;
1338 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 struct btrfs_ordered_extent *ordered;
1340 u64 lockstart, lockend;
1341 u64 num_bytes;
1342 int ret;
1343
1344 lockstart = round_down(pos, root->sectorsize);
1345 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1346
1347 while (1) {
1348 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1349 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1350 lockend - lockstart + 1);
1351 if (!ordered) {
1352 break;
1353 }
1354 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1355 btrfs_start_ordered_extent(inode, ordered, 1);
1356 btrfs_put_ordered_extent(ordered);
1357 }
1358
1359 trans = btrfs_join_transaction(root);
1360 if (IS_ERR(trans)) {
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362 return PTR_ERR(trans);
1363 }
1364
1365 num_bytes = lockend - lockstart + 1;
1366 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1367 NULL);
1368 btrfs_end_transaction(trans, root);
1369 if (ret <= 0) {
1370 ret = 0;
1371 } else {
1372 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1373 EXTENT_DIRTY | EXTENT_DELALLOC |
1374 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1375 NULL, GFP_NOFS);
1376 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1377 }
1378
1379 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1380
1381 return ret;
1382}
1383
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1384static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1385 struct iov_iter *i,
1322 loff_t pos) 1386 loff_t pos)
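
check_can_nocow() exists for the buffered-write hunks that follow: when the plain data-space reservation fails with ENOSPC on a NODATACOW or preallocated inode, the write may still proceed by overwriting existing extents in place, in which case only metadata needs reserving. A compile-only sketch of that decision flow; the three helpers are stubs whose names merely echo the kernel functions:

    #include <errno.h>
    #include <stdbool.h>
    #include <stddef.h>

    static int reserve_data(size_t bytes) { (void)bytes; return -ENOSPC; }
    static int can_nocow(size_t *bytes) { (void)bytes; return 1; }
    static int reserve_metadata(size_t bytes) { (void)bytes; return 0; }

    static int reserve_for_write(size_t *write_bytes, bool nodatacow,
                                 bool *meta_only)
    {
        int ret = reserve_data(*write_bytes);

        if (ret == -ENOSPC && nodatacow) {
            /* No free data space, but the range may be overwritable in
             * place; can_nocow() may shrink *write_bytes to what is. */
            ret = can_nocow(write_bytes);
            if (ret > 0) {
                *meta_only = true;
                ret = 0;
            } else {
                ret = -ENOSPC;
            }
        }
        if (ret)
            return ret;

        ret = reserve_metadata(*write_bytes);
        /* on failure the caller also drops the data reservation, unless
         * the nocow path skipped it (*meta_only) */
        return ret;
    }
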
@@ -1324,10 +1388,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1388 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1389 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1390 struct page **pages = NULL;
1391 u64 release_bytes = 0;
1327 unsigned long first_index; 1392 unsigned long first_index;
1328 size_t num_written = 0; 1393 size_t num_written = 0;
1329 int nrptrs; 1394 int nrptrs;
1330 int ret = 0; 1395 int ret = 0;
1396 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1397 bool force_page_uptodate = false;
1332 1398
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1399 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1414 offset);
1349 size_t num_pages = (write_bytes + offset + 1415 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1416 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1417 size_t reserve_bytes;
1351 size_t dirty_pages; 1418 size_t dirty_pages;
1352 size_t copied; 1419 size_t copied;
1353 1420
@@ -1362,11 +1429,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1429 break;
1363 } 1430 }
1364 1431
1365 ret = btrfs_delalloc_reserve_space(inode, 1432 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1433 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1434 if (ret == -ENOSPC &&
1435 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1436 BTRFS_INODE_PREALLOC))) {
1437 ret = check_can_nocow(inode, pos, &write_bytes);
1438 if (ret > 0) {
1439 only_release_metadata = true;
1440 /*
1441 * our prealloc extent may be smaller than
1442 * write_bytes, so scale down.
1443 */
1444 num_pages = (write_bytes + offset +
1445 PAGE_CACHE_SIZE - 1) >>
1446 PAGE_CACHE_SHIFT;
1447 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1448 ret = 0;
1449 } else {
1450 ret = -ENOSPC;
1451 }
1452 }
1453
1367 if (ret) 1454 if (ret)
1368 break; 1455 break;
1369 1456
1457 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1458 if (ret) {
1459 if (!only_release_metadata)
1460 btrfs_free_reserved_data_space(inode,
1461 reserve_bytes);
1462 break;
1463 }
1464
1465 release_bytes = reserve_bytes;
1466
1370 /* 1467 /*
1371 * This is going to setup the pages array with the number of 1468 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1469 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1472,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1472 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1473 pos, first_index, write_bytes,
1377 force_page_uptodate); 1474 force_page_uptodate);
1378 if (ret) { 1475 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1476 break;
1382 }
1383 1477
1384 copied = btrfs_copy_from_user(pos, num_pages, 1478 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1479 write_bytes, pages, i);
@@ -1409,30 +1503,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1503 * managed to copy.
1410 */ 1504 */
1411 if (num_pages > dirty_pages) { 1505 if (num_pages > dirty_pages) {
1506 release_bytes = (num_pages - dirty_pages) <<
1507 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1508 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1509 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1510 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1511 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1512 }
1417 btrfs_delalloc_release_space(inode, 1513 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1514 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1515 release_bytes);
1516 else
1517 btrfs_delalloc_release_space(inode,
1518 release_bytes);
1420 } 1519 }
1421 1520
1521 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1522 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1523 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1524 dirty_pages, pos, copied,
1425 NULL); 1525 NULL);
1426 if (ret) { 1526 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1527 btrfs_drop_pages(pages, num_pages);
1430 break; 1528 break;
1431 } 1529 }
1432 } 1530 }
1433 1531
1532 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1533 btrfs_drop_pages(pages, num_pages);
1435 1534
1535 if (only_release_metadata && copied > 0) {
1536 u64 lockstart = round_down(pos, root->sectorsize);
1537 u64 lockend = lockstart +
1538 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1539
1540 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1541 lockend, EXTENT_NORESERVE, NULL,
1542 NULL, GFP_NOFS);
1543 only_release_metadata = false;
1544 }
1545
1436 cond_resched(); 1546 cond_resched();
1437 1547
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1548 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1555,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1555
1446 kfree(pages); 1556 kfree(pages);
1447 1557
1558 if (release_bytes) {
1559 if (only_release_metadata)
1560 btrfs_delalloc_release_metadata(inode, release_bytes);
1561 else
1562 btrfs_delalloc_release_space(inode, release_bytes);
1563 }
1564
1448 return num_written ? num_written : ret; 1565 return num_written ? num_written : ret;
1449} 1566}
1450 1567
@@ -2175,12 +2292,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2292 goto out_reserve_fail;
2176 } 2293 }
2177 2294
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2295 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2296 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2297 if (ret)
@@ -2191,8 +2302,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2302 alloc_start);
2192 if (ret) 2303 if (ret)
2193 goto out; 2304 goto out;
2305 } else {
2306 /*
2307 * If we are fallocating from the end of the file onward we
2308 * need to zero out the end of the page if i_size lands in the
2309 * middle of a page.
2310 */
2311 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2312 if (ret)
2313 goto out;
2194 } 2314 }
2195 2315
2316 /*
2317 * wait for ordered IO before we have any locks. We'll loop again
2318 * below with the locks held.
2319 */
2320 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2321
2196 locked_end = alloc_end - 1; 2322 locked_end = alloc_end - 1;
2197 while (1) { 2323 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2324 struct btrfs_ordered_extent *ordered;
@@ -2425,20 +2551,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2425 } 2551 }
2426 } 2552 }
2427 2553
2428 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { 2554 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2429 offset = -EINVAL;
2430 goto out;
2431 }
2432 if (offset > inode->i_sb->s_maxbytes) {
2433 offset = -EINVAL;
2434 goto out;
2435 }
2436
2437 /* Special lock needed here? */
2438 if (offset != file->f_pos) {
2439 file->f_pos = offset;
2440 file->f_version = 0;
2441 }
2442out: 2555out:
2443 mutex_unlock(&inode->i_mutex); 2556 mutex_unlock(&inode->i_mutex);
2444 return offset; 2557 return offset;
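
The llseek conversion folds the open-coded bounds checks and f_pos update into vfs_setpos(). Judging by the code this hunk deletes, the helper behaves roughly like the following userspace rendering; field names are simplified, and negative offsets are only legal for FMODE_UNSIGNED_OFFSET files:

    #include <errno.h>
    #include <stdint.h>

    struct file_pos {
        int64_t f_pos;
        uint64_t f_version;
        int unsigned_offsets;   /* FMODE_UNSIGNED_OFFSET */
    };

    static int64_t setpos_sketch(struct file_pos *f, int64_t offset,
                                 int64_t maxsize)
    {
        if (offset < 0 && !f->unsigned_offsets)
            return -EINVAL;
        if (offset > maxsize)
            return -EINVAL;

        if (offset != f->f_pos) {
            f->f_pos = offset;
            f->f_version = 0;   /* invalidate cached directory position */
        }
        return offset;
    }
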
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e53009657f0e..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an extent 3156
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
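
The test_msg() macro added above leans on two preprocessor features worth noting: adjacent string literals paste the "btrfs: selftest: " prefix onto the caller's format string, and ##__VA_ARGS__ deletes the trailing comma when a call supplies no variadic arguments, which a plain __VA_ARGS__ would not. The same macro shape over printf():

    #include <stdio.h>

    #define test_msg(fmt, ...) printf("btrfs: selftest: " fmt, ##__VA_ARGS__)

    int main(void)
    {
        test_msg("Running extent only tests\n");      /* no varargs: ## trick */
        test_msg("Error removing extent %d\n", -22);  /* with an argument */
        return 0;
    }
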
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Full remove left some lingering space\n");
+		test_msg("Full remove left some lingering space\n");
 		return -1;
 	}

 	/* Ok edge and middle cases now */
 	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error adding half extent %d\n", ret);
+		test_msg("Error adding half extent %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing tail end %d\n", ret);
+		test_msg("Error removing tail end %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing front end %d\n", ret);
+		test_msg("Error removing front end %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
 	if (ret) {
-		printk(KERN_ERR "Error removing middle peice %d\n", ret);
+		test_msg("Error removing middle piece %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Still have space at the front\n");
+		test_msg("Still have space at the front\n");
 		return -1;
 	}

 	if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
-		printk(KERN_ERR "Still have space in the middle\n");
+		test_msg("Still have space in the middle\n");
 		return -1;
 	}

 	if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Still have space at the end\n");
+		test_msg("Still have space at the end\n");
 		return -1;
 	}

@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 	u64 next_bitmap_offset;
 	int ret;

-	printk(KERN_ERR "Running bitmap only tests\n");
+	test_msg("Running bitmap only tests\n");

 	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+		test_msg("Couldn't create a bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+		test_msg("Error removing bitmap full range %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Left some space in bitmap\n");
+		test_msg("Left some space in bitmap\n");
 		return -1;
 	}

 	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+		test_msg("Couldn't add to our bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+		test_msg("Couldn't remove middle chunk %d\n", ret);
 		return ret;
 	}

@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 	ret = add_free_space_entry(cache, next_bitmap_offset -
 				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
-		       " %d\n", ret);
+		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+			 ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
 				      (1 * 1024 * 1024), 2 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
 			 2 * 1024 * 1024)) {
-		printk(KERN_ERR "Left some space when removing overlapping\n");
+		test_msg("Left some space when removing overlapping\n");
 		return -1;
 	}

@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
 	int ret;

-	printk(KERN_ERR "Running bitmap and extent tests\n");
+	test_msg("Running bitmap and extent tests\n");

 	/*
 	 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+		test_msg("Couldn't create bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+		test_msg("Couldn't remove extent entry %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Left remnants after our remove\n");
+		test_msg("Left remnants after our remove\n");
 		return -1;
 	}

 	/* Now to add back the extent entry and remove from the bitmap */
 	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+		test_msg("Couldn't re-add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+		test_msg("Couldn't remove from bitmap %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Left remnants in the bitmap\n");
+		test_msg("Left remnants in the bitmap\n");
 		return -1;
 	}

@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+		test_msg("Couldn't add to a bitmap %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-		printk(KERN_ERR "Left over peices after removing "
-		       "overlapping\n");
+		test_msg("Left over pieces after removing overlapping\n");
 		return -1;
 	}

@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	/* Now with the extent entry offset into the bitmap */
 	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+		test_msg("Couldn't add extent to the cache %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+		test_msg("Problem removing overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Left something behind when removing space");
+		test_msg("Left something behind when removing space\n");
 		return -1;
 	}

@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
 				   4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+		test_msg("Couldn't add bitmap %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
 				   5 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
 				      5 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Failed to free our space %d\n", ret);
+		test_msg("Failed to free our space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
 			 5 * 1024 * 1024)) {
-		printk(KERN_ERR "Left stuff over\n");
+		test_msg("Left stuff over\n");
 		return -1;
 	}

@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+		test_msg("Couldn't add bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing bitmap and extent "
-		       "overlapping %d\n", ret);
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
 		return ret;
 	}

@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
 {
 	struct btrfs_block_group_cache *cache;

-	printk(KERN_ERR "Running btrfs free space cache tests\n");
+	test_msg("Running btrfs free space cache tests\n");

 	cache = init_test_block_group();
 	if (!cache) {
-		printk(KERN_ERR "Couldn't run the tests\n");
+		test_msg("Couldn't run the tests\n");
 		return;
 	}

@@ -3487,6 +3487,9 @@ out:
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 	kfree(cache->free_space_ctl);
 	kfree(cache);
-	printk(KERN_ERR "Free space cache tests finished\n");
+	test_msg("Free space cache tests finished\n");
 }
-#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
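
[Note] The hunk above changes how the self-test entry point is compiled out: instead of fencing the declaration and every call site with #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS, the implementation provides an empty stub when the tests are disabled, so callers can invoke the function unconditionally. A minimal sketch of the pattern (illustrative names, not the btrfs ones):

	/* header: always declared, no #ifdef needed at call sites */
	void mymod_run_sanity_tests(void);

	/* implementation file */
	#ifdef CONFIG_MYMOD_RUN_SANITY_TESTS
	void mymod_run_sanity_tests(void)
	{
		/* real test bodies go here */
	}
	#else /* !CONFIG_MYMOD_RUN_SANITY_TESTS */
	void mymod_run_sanity_tests(void) {}
	#endif

In the disabled configuration the call simply becomes a no-op, which is why the #ifdef around the declaration can be dropped in the header hunk that follows.
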
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);

-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_free_space_cache(void);
-#endif

 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17f3064b4a3e..021694c08181 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -57,6 +58,7 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "backref.h"
+#include "hash.h"

 struct btrfs_iget_args {
 	u64 ino;
@@ -701,8 +703,12 @@ retry:
 		async_extent->nr_pages = 0;
 		async_extent->pages = NULL;

-		if (ret == -ENOSPC)
+		if (ret == -ENOSPC) {
+			unlock_extent(io_tree, async_extent->start,
+				      async_extent->start +
+				      async_extent->ram_size - 1);
 			goto retry;
+		}
 		goto out_free;
 	}

@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 	spin_unlock(&BTRFS_I(inode)->lock);
 }

+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+			      &root->delalloc_inodes);
+		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			&BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes++;
+		if (root->nr_delalloc_inodes == 1) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(!list_empty(&root->delalloc_root));
+			list_add_tail(&root->delalloc_root,
+				      &root->fs_info->delalloc_roots);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes--;
+		if (!root->nr_delalloc_inodes) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(list_empty(&root->delalloc_root));
+			list_del_init(&root->delalloc_root);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
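
[Note] The two helpers above move delalloc-inode tracking from one fs-wide list onto per-root lists: most adds and removes contend only on root->delalloc_lock, and the fs-wide delalloc_root_lock is taken only on the 0 -> 1 and 1 -> 0 transitions of nr_delalloc_inodes, when the root itself joins or leaves fs_info->delalloc_roots. A distilled sketch of that two-level pattern in kernel-style C (generic names, assuming <linux/list.h> and <linux/spinlock.h>; this is not the btrfs API):

	struct global {
		spinlock_t lock;		/* guards groups */
		struct list_head groups;	/* groups that have members */
	};

	struct group {
		struct global *global;
		spinlock_t lock;		/* guards members, nr_members */
		struct list_head members;
		struct list_head node;		/* link on global->groups */
		int nr_members;
	};

	static void group_add(struct group *g, struct list_head *item)
	{
		spin_lock(&g->lock);
		if (list_empty(item)) {
			list_add_tail(item, &g->members);
			/* the global lock is touched only on the 0 -> 1 edge */
			if (++g->nr_members == 1) {
				spin_lock(&g->global->lock);
				list_add_tail(&g->node, &g->global->groups);
				spin_unlock(&g->global->lock);
			}
		}
		spin_unlock(&g->lock);
	}

The lock ordering (per-group lock outer, global lock inner) matches the helpers above, so a fs-wide walker must drop the global lock before diving into a root, which is exactly what the __start_delalloc_inodes rework later in this patch does.
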
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			 &BTRFS_I(inode)->runtime_flags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-					      &root->fs_info->delalloc_inodes);
-				set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					&BTRFS_I(inode)->runtime_flags);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-		}
+			 &BTRFS_I(inode)->runtime_flags))
+			btrfs_add_delalloc_inodes(root, inode);
 		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		btrfs_delalloc_release_metadata(inode, len);

 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-		    && do_list)
+		    && do_list && !(state->state & EXTENT_NORESERVE))
 			btrfs_free_reserved_data_space(inode, len);

 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		BTRFS_I(inode)->delalloc_bytes -= len;
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			     &BTRFS_I(inode)->runtime_flags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-				list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-				clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					  &BTRFS_I(inode)->runtime_flags);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-		}
+			     &BTRFS_I(inode)->runtime_flags))
+			btrfs_del_delalloc_inode(root, inode);
 		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -2135,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
 			continue;

-		extent_offset = btrfs_file_extent_offset(leaf, extent);
-		if (key.offset - extent_offset != offset)
+		/*
+		 * 'offset' refers to the exact key.offset,
+		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+		 * (key.offset - extent_offset).
+		 */
+		if (key.offset != offset)
 			continue;

+		extent_offset = btrfs_file_extent_offset(leaf, extent);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
 		if (extent_offset >= old->extent_offset + old->offset +
 		    old->len || extent_offset + num_bytes <=
 		    old->extent_offset + old->offset)
 			continue;

+		ret = 0;
 		break;
 	}

@@ -2156,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,

 	backref->root_id = root_id;
 	backref->inum = inum;
-	backref->file_pos = offset + extent_offset;
+	backref->file_pos = offset;
 	backref->num_bytes = num_bytes;
 	backref->extent_offset = extent_offset;
 	backref->generation = btrfs_file_extent_generation(leaf, extent);
@@ -2179,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
 	new->path = path;

 	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		ret = iterate_inodes_from_logical(old->bytenr, fs_info,
+		ret = iterate_inodes_from_logical(old->bytenr +
+						  old->extent_offset, fs_info,
 						  path, record_one_backref,
 						  old);
 		BUG_ON(ret < 0 && ret != -ENOENT);
@@ -2263,11 +2302,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
 			return 0;
 		return PTR_ERR(root);
 	}
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		srcu_read_unlock(&fs_info->subvol_srcu, index);
-		/* parse ENOENT to 0 */
-		return 0;
-	}

 	/* step 2: get inode */
 	key.objectid = backref->inum;
@@ -3215,13 +3249,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			/* 1 for the orphan item deletion. */
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
+				iput(inode);
 				ret = PTR_ERR(trans);
 				goto out;
 			}
 			ret = btrfs_orphan_add(trans, inode);
 			btrfs_end_transaction(trans, root);
-			if (ret)
+			if (ret) {
+				iput(inode);
 				goto out;
+			}

 			ret = btrfs_truncate(inode);
 			if (ret)
@@ -3274,8 +3311,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
+	static u64 xattr_access = 0;
+	static u64 xattr_default = 0;
 	int scanned = 0;

+	if (!xattr_access) {
+		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+					strlen(POSIX_ACL_XATTR_ACCESS));
+		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+					strlen(POSIX_ACL_XATTR_DEFAULT));
+	}
+
 	slot++;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3331,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 			return 0;

 		/* we found an xattr, assume we've got an acl */
-		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-			return 1;
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (found_key.offset == xattr_access ||
+			    found_key.offset == xattr_default)
+				return 1;
+		}

 		/*
 		 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3709,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	}
 	return ret;
 }
-
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-			     struct btrfs_path *path)
-{
-	struct extent_buffer *eb;
-	int level;
-	u64 refs = 1;
-
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-		int ret;
-
-		if (!path->nodes[level])
-			break;
-		eb = path->nodes[level];
-		if (!btrfs_block_can_be_shared(root, eb))
-			continue;
-		ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
-					       &refs, NULL);
-		if (refs > 1)
-			return 1;
-	}
-	return 0;
-}

 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-						       struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
-	struct inode *inode = dentry->d_inode;
-	u64 index;
-	int check_link = 1;
-	int err = -ENOSPC;
 	int ret;
-	u64 ino = btrfs_ino(inode);
-	u64 dir_ino = btrfs_ino(dir);

 	/*
 	 * 1 for the possible orphan item
@@ -3719,158 +3735,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
 		return trans;

-	if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-		return ERR_PTR(-ENOSPC);
-
-	/* check if there is someone else holds reference */
-	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-		return ERR_PTR(-ENOSPC);
-
-	if (atomic_read(&inode->i_count) > 2)
-		return ERR_PTR(-ENOSPC);
-
-	if (xchg(&root->fs_info->enospc_unlink, 1))
-		return ERR_PTR(-ENOSPC);
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* 1 for the orphan item */
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
-		root->fs_info->enospc_unlink = 0;
-		return trans;
-	}
-
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(dir)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	if (ret == 0 && S_ISREG(inode->i_mode)) {
-		ret = btrfs_lookup_file_extent(trans, root, path,
-					       ino, (u64)-1, 0);
-		if (ret < 0) {
-			err = ret;
-			goto out;
+	if (PTR_ERR(trans) == -ENOSPC) {
+		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return trans;
+		ret = btrfs_cond_migrate_bytes(root->fs_info,
+					       &root->fs_info->trans_block_rsv,
+					       num_bytes, 5);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			return ERR_PTR(ret);
 		}
-		BUG_ON(ret == 0); /* Corruption */
-		if (check_path_shared(root, path))
-			goto out;
-		btrfs_release_path(path);
-	}
-
-	if (!check_link) {
-		err = 0;
-		goto out;
-	}
-
-	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-				   dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	if (di) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		err = 0;
-		goto out;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-					dentry->d_name.len, ino, dir_ino, 0,
-					&index);
-	if (ret) {
-		err = ret;
-		goto out;
-	}
-
-	if (check_path_shared(root, path))
-		goto out;
-
-	btrfs_release_path(path);
-
-	/*
-	 * This is a commit root search, if we can lookup inode item and other
-	 * relative items in the commit root, it means the transaction of
-	 * dir/file creation has been committed, and the dir index item that we
-	 * delay to insert has also been inserted into the commit root. So
-	 * we needn't worry about the delayed insertion of the dir index item
-	 * here.
-	 */
-	di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-				   dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	BUG_ON(ret == -ENOENT);
-	if (check_path_shared(root, path))
-		goto out;
-
-	err = 0;
-out:
-	btrfs_free_path(path);
-	/* Migrate the orphan reservation over */
-	if (!err)
-		err = btrfs_block_rsv_migrate(trans->block_rsv,
-				&root->fs_info->global_block_rsv,
-				trans->bytes_reserved);
-
-	if (err) {
-		btrfs_end_transaction(trans, root);
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(err);
-	}
-
-	trans->block_rsv = &root->fs_info->global_block_rsv;
-	return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
-{
-	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-		btrfs_block_rsv_release(root, trans->block_rsv,
-					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		BUG_ON(!root->fs_info->enospc_unlink);
-		root->fs_info->enospc_unlink = 0;
+		trans->bytes_reserved = num_bytes;
 	}
-	btrfs_end_transaction(trans, root);
+	return trans;
 }

 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3761,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int ret;

-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

@@ -3898,7 +3779,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}

 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 	return ret;
 }
@@ -3995,7 +3876,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
 		return -EPERM;

-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

@@ -4017,7 +3898,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);

 	return err;
@@ -4395,6 +4276,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	u64 hole_size;
 	int err = 0;

+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
+	 */
+	err = btrfs_truncate_page(inode, oldsize, 0, 0);
+	if (err)
+		return err;
+
 	if (size <= hole_start)
 		return 0;

@@ -4509,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	int mask = attr->ia_valid;
 	int ret;

-	if (newsize == oldsize)
-		return 0;
-
 	/*
 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
 	 * special case where we need to update the times despite not having
@@ -4822,11 +4709,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 		goto out;
 	}

-	if (btrfs_root_refs(&new_root->root_item) == 0) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	*sub_root = new_root;
 	location->objectid = btrfs_root_dirid(&new_root->root_item);
 	location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4974,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			ret = btrfs_orphan_cleanup(sub_root);
 		up_read(&root->fs_info->cleanup_work_sem);
-		if (ret)
+		if (ret) {
+			iput(inode);
 			inode = ERR_PTR(ret);
+		}
 	}

 	return inode;
@@ -5137,10 +5021,9 @@ unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };

-static int btrfs_real_readdir(struct file *filp, void *dirent,
-			      filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -5161,29 +5044,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* filp->f_pos points to the current index? */
+	int is_curr = 0;	/* ctx->pos points to the current index? */

 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;

-	/* special case for "." */
-	if (filp->f_pos == 0) {
-		over = filldir(dirent, ".", 1,
-			       filp->f_pos, btrfs_ino(inode), DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 1;
-	}
-	/* special case for .., just use the back ref */
-	if (filp->f_pos == 1) {
-		u64 pino = parent_ino(filp->f_path.dentry);
-		over = filldir(dirent, "..", 2,
-			       filp->f_pos, pino, DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
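
[Note] This hunk is part of the kernel-wide 3.11 conversion from the old ->readdir/filldir callbacks to ->iterate with a struct dir_context: the hand-rolled "." and ".." cases collapse into dir_emit_dots(), and the position lives in ctx->pos rather than filp->f_pos. For reference, a minimal iterate method in the same style, over a hypothetical fixed name table instead of a real on-disk index (the table and inode numbers are placeholders):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		static const char * const names[] = { "alpha", "beta" };

		/* emits "." and ".." as needed and advances ctx->pos to 2 */
		if (!dir_emit_dots(file, ctx))
			return 0;

		while (ctx->pos - 2 < (loff_t)ARRAY_SIZE(names)) {
			const char *name = names[ctx->pos - 2];

			/* dir_emit() returns false once the user buffer is full */
			if (!dir_emit(ctx, name, strlen(name),
				      1000 + ctx->pos, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}

A real filesystem would of course report each entry's actual inode number and type, as the btrfs code below does with location.objectid and d_type.
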
@@ -5197,7 +5066,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	}

 	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
+	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);

 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5223,14 +5092,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			break;
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (found_key.offset < filp->f_pos)
+		if (found_key.offset < ctx->pos)
 			goto next;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
 		    btrfs_should_delete_dir_index(&del_list,
 						  found_key.offset))
 			goto next;

-		filp->f_pos = found_key.offset;
+		ctx->pos = found_key.offset;
 		is_curr = 1;

 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5274,9 +5143,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 				over = 0;
 				goto skip;
 			}
-			over = filldir(dirent, name_ptr, name_len,
-				       found_key.offset, location.objectid,
-				       d_type);
+			over = !dir_emit(ctx, name_ptr, name_len,
+					 location.objectid, d_type);

 skip:
 			if (name_ptr != tmp_name)
@@ -5295,22 +5163,38 @@ next:

 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
-			filp->f_pos++;
-		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
-						      &ins_list);
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 		if (ret)
 			goto nopos;
 	}

 	/* Reached end of directory/root. Bump pos past the last item. */
-	if (key_type == BTRFS_DIR_INDEX_KEY)
-		/*
-		 * 32-bit glibc will use getdents64, but then strtol -
-		 * so the last number we can serve is this.
-		 */
-		filp->f_pos = 0x7fffffff;
-	else
-		filp->f_pos++;
+	ctx->pos++;
+
+	/*
+	 * Stop new entries from being returned after we return the last
+	 * entry.
+	 *
+	 * New directory entries are assigned a strictly increasing
+	 * offset. This means that new entries created during readdir
+	 * are *guaranteed* to be seen in the future by that readdir.
+	 * This has broken buggy programs which operate on names as
+	 * they're returned by readdir. Until we re-use freed offsets
+	 * we have this hack to stop new entries from being returned
+	 * under the assumption that they'll never reach this huge
+	 * offset.
+	 *
+	 * This is being careful not to overflow 32bit loff_t unless the
+	 * last entry requires it because doing so has broken 32bit apps
+	 * in the past.
+	 */
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		if (ctx->pos >= INT_MAX)
+			ctx->pos = LLONG_MAX;
+		else
+			ctx->pos = INT_MAX;
+	}
 nopos:
 	ret = 0;
 err:
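
[Note] The long comment added above justifies parking ctx->pos at a sentinel once the listing is exhausted. Reduced to a standalone helper, the rule is a two-level clamp: stay at INT_MAX so 32-bit getdents users keep working, and spill into the full loff_t range only when an entry at or beyond INT_MAX has genuinely been handed out. A sketch of just that logic (not a btrfs function; assumes the usual INT_MAX/LLONG_MAX limits):

	/*
	 * Park the directory position past the final entry.  Prefer a
	 * 32-bit-safe sentinel; only go to LLONG_MAX once INT_MAX has
	 * actually been used as a real entry offset.
	 */
	static loff_t readdir_end_pos(loff_t pos)
	{
		return pos >= INT_MAX ? LLONG_MAX : INT_MAX;
	}
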
@@ -6518,10 +6402,10 @@ out:
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-				      struct inode *inode, u64 offset, u64 *len,
-				      u64 *orig_start, u64 *orig_block_len,
-				      u64 *ram_bytes)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -6535,7 +6419,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int slot;
 	int found_type;
-
+	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -6575,18 +6459,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 		/* not a regular extent, must cow */
 		goto out;
 	}
+
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+		goto out;
+
 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	if (disk_bytenr == 0)
+		goto out;
+
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		goto out;
+
 	backref_offset = btrfs_file_extent_offset(leaf, fi);

-	*orig_start = key.offset - backref_offset;
-	*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
-	*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	if (orig_start) {
+		*orig_start = key.offset - backref_offset;
+		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	}

 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-	if (extent_end < offset + *len) {
-		/* extent doesn't include our full range, must cow */
-		goto out;
-	}

 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
@@ -6830,8 +6724,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (IS_ERR(trans))
 		goto must_cow;

-	if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
-			      &orig_block_len, &ram_bytes) == 1) {
+	if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+			     &orig_block_len, &ram_bytes) == 1) {
 		if (type == BTRFS_ORDERED_PREALLOC) {
 			free_extent_map(em);
 			em = create_pinned_em(inode, start, len,
@@ -7260,7 +7154,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
-	struct bio_vec *bvec = dio_bio->bi_io_vec;
 	struct bio *io_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
@@ -7282,16 +7175,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	}

 	dip->private = dio_bio->bi_private;
-	io_bio->bi_private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
-
-	dip->bytes = 0;
-	do {
-		dip->bytes += bvec->bv_len;
-		bvec++;
-	} while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
-
+	dip->bytes = dio_bio->bi_size;
 	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
 	io_bio->bi_private = dip;
 	dip->errors = 0;
@@ -7390,8 +7276,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	atomic_inc(&inode->i_dio_count);
 	smp_mb__after_atomic_inc();

+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which isn't
+	 * enough if we've written compressed pages to this area, so we need to
+	 * call btrfs_wait_ordered_range to make absolutely sure that any
+	 * outstanding dirty pages are on disk.
+	 */
+	count = iov_length(iov, nr_segs);
+	btrfs_wait_ordered_range(inode, offset, count);
+
 	if (rw & WRITE) {
-		count = iov_length(iov, nr_segs);
 		/*
 		 * If the write DIO is beyond the EOF, we need update
 		 * the isize, but it is protected by i_mutex. So we can
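
[Note] The comment added in this hunk spells out why btrfs cannot rely on the generic direct-I/O preparation alone: filemap_write_and_wait_range() waits for page writeback, but compressed data is only safely on disk once its ordered extent has completed. Condensed into one hypothetical helper (the function name and shape are ours for illustration; only the two calls are real kernel/btrfs APIs of this era):

	/* sketch: make [off, off + len) truly stable before direct I/O */
	static void flush_range_for_dio(struct inode *inode, u64 off, u64 len)
	{
		/* pushes dirty pages and waits for ordinary writeback... */
		filemap_write_and_wait_range(inode->i_mapping, off,
					     off + len - 1);
		/* ...but compressed extents are durable only once their
		 * ordered extent finishes, so wait for those as well */
		btrfs_wait_ordered_range(inode, off, len);
	}
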
@@ -7510,7 +7404,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }

-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
@@ -7710,16 +7605,12 @@ static int btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	int ret;
+	int ret = 0;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

-	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
-	if (ret)
-		return ret;
-
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

@@ -7977,9 +7868,9 @@ void btrfs_destroy_inode(struct inode *inode)
 	 */
 	smp_mb();
 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}

 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8349,7 +8240,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -8358,30 +8249,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	struct list_head splice;
 	int ret = 0;

-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
 	INIT_LIST_HEAD(&works);
 	INIT_LIST_HEAD(&splice);

-	spin_lock(&root->fs_info->delalloc_lock);
-	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
 	while (!list_empty(&splice)) {
 		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);

-		list_del_init(&binode->delalloc_inodes);
-
+		list_move_tail(&binode->delalloc_inodes,
+			       &root->delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
 		if (!inode) {
-			clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-				  &binode->runtime_flags);
+			cond_resched_lock(&root->delalloc_lock);
 			continue;
 		}
-
-		list_add_tail(&binode->delalloc_inodes,
-			      &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock(&root->delalloc_lock);

 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
 		if (unlikely(!work)) {
@@ -8393,16 +8277,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 				   &work->work);

 		cond_resched();
-		spin_lock(&root->fs_info->delalloc_lock);
+		spin_lock(&root->delalloc_lock);
 	}
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);

 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->delalloc_lock);
+		list_splice_tail(&splice, &root->delalloc_inodes);
+		spin_unlock(&root->delalloc_lock);
+	}
+	return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+	int ret;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;

-	/* the filemap_flush will queue IO into the worker threads, but
+	ret = __start_delalloc_inodes(root, delay_iput);
+	/*
+	 * the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
 	 */
@@ -8414,17 +8321,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 						    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
-	return 0;
-out:
-	list_for_each_entry_safe(work, next, &works, list) {
-		list_del_init(&work->list);
-		btrfs_wait_and_free_delalloc_work(work);
+	return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int ret;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->delalloc_root,
+			       &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		ret = __start_delalloc_inodes(root, delay_iput);
+		btrfs_put_fs_root(root);
+		if (ret)
+			goto out;
+
+		spin_lock(&fs_info->delalloc_root_lock);
 	}
+	spin_unlock(&fs_info->delalloc_root_lock);

+	atomic_inc(&fs_info->async_submit_draining);
+	while (atomic_read(&fs_info->nr_async_submits) ||
+	       atomic_read(&fs_info->async_delalloc_pages)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0 &&
+			    atomic_read(&fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&fs_info->async_submit_draining);
+	return 0;
+out:
 	if (!list_empty_careful(&splice)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_lock(&fs_info->delalloc_root_lock);
+		list_splice_tail(&splice, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
 	}
 	return ret;
 }
@@ -8731,7 +8676,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= btrfs_real_readdir,
+	.iterate	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f81d67cdc8d..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!root->ref_cows)
 		return -EINVAL;

+	ret = btrfs_start_delalloc_inodes(root, 0);
+	if (ret)
+		return ret;
+
+	btrfs_wait_ordered_extents(root, 0);
+
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
 		return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	if (ret)
 		return ret;

-	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		mnt_drop_write_file(file);
-		return -EINVAL;
-	}
-
-	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	}

 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	ret = btrfs_rm_device(root, vol_args->name);

-	kfree(vol_args);
-out:
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = btrfs_rm_device(root, vol_args->name);
 	mutex_unlock(&root->fs_info->volume_mutex);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+	kfree(vol_args);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;

 	/*
 	 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,

 	ret = -EINVAL;
 	if (src == inode)
-		goto out_fput;
+		same_inode = 1;

 	/* the src must be open for reading */
 	if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	}
 	path->reada = 2;

-	if (inode < src) {
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
 	} else {
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&src->i_mutex);
 	}

 	/* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* check whether the ranges overlap within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3881,7 +3891,7 @@ drop_write:
3881 3891
3882static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 3892static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
3883{ 3893{
3884 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3894 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3885 struct btrfs_ioctl_quota_rescan_args *qsa; 3895 struct btrfs_ioctl_quota_rescan_args *qsa;
3886 int ret; 3896 int ret;
3887 3897
@@ -3914,7 +3924,7 @@ drop_write:
3914 3924
3915static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 3925static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3916{ 3926{
3917 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3927 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3918 struct btrfs_ioctl_quota_rescan_args *qsa; 3928 struct btrfs_ioctl_quota_rescan_args *qsa;
3919 int ret = 0; 3929 int ret = 0;
3920 3930
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4020,7 +4040,7 @@ out:
4020 4040
4021static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 4041static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4022{ 4042{
4023 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4043 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4024 const char *label = root->fs_info->super_copy->label; 4044 const char *label = root->fs_info->super_copy->label;
4025 size_t len = strnlen(label, BTRFS_LABEL_SIZE); 4045 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
4026 int ret; 4046 int ret;
@@ -4039,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4039 4059
4040static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 4060static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4041{ 4061{
4042 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4062 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4043 struct btrfs_super_block *super_block = root->fs_info->super_copy; 4063 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4044 struct btrfs_trans_handle *trans; 4064 struct btrfs_trans_handle *trans;
4045 char label[BTRFS_LABEL_SIZE]; 4065 char label[BTRFS_LABEL_SIZE];
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
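
One detail in the clone hunks above is worth pulling out: cloning within a single file is now permitted, but only for non-overlapping ranges. With half-open intervals [off, off+len) and [destoff, destoff+len), the ranges overlap exactly when each starts before the other ends, which is the test added at new lines 2586-2587. A standalone rendering of that predicate (the function name is ours, not btrfs's):

    #include <stdbool.h>
    #include <stdint.h>

    /* half-open intervals overlap iff each starts before the other
     * ends; this mirrors the same-inode check btrfs_ioctl_clone adds */
    static bool clone_ranges_overlap(uint64_t off, uint64_t destoff,
                                     uint64_t len)
    {
            return destoff + len > off && destoff < off + len;
    }

The same series of hunks also drops the two-lock ordering dance for the same-inode case, taking src->i_mutex exactly once and skipping the second unlock on the way out.
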
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
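
The theme running through the ordered-data.c changes is moving ordered-extent bookkeeping from one fs_info-global list to per-root lists, with fs_info->ordered_roots tracking only the roots that currently have work pending. Condensing the add and remove paths above into a skeleton (locking as in the diff, BUG_ON checks and surrounding code elided):

    /* first ordered extent on a root publishes the root globally */
    spin_lock(&root->ordered_extent_lock);
    list_add_tail(&entry->root_extent_list, &root->ordered_extents);
    if (++root->nr_ordered_extents == 1) {
            spin_lock(&root->fs_info->ordered_root_lock);
            list_add_tail(&root->ordered_root,
                          &root->fs_info->ordered_roots);
            spin_unlock(&root->fs_info->ordered_root_lock);
    }
    spin_unlock(&root->ordered_extent_lock);

    /* removing the last one takes the root back off the global list */
    spin_lock(&root->ordered_extent_lock);
    list_del_init(&entry->root_extent_list);
    if (--root->nr_ordered_extents == 0) {
            spin_lock(&root->fs_info->ordered_root_lock);
            list_del_init(&root->ordered_root);
            spin_unlock(&root->fs_info->ordered_root_lock);
    }
    spin_unlock(&root->ordered_extent_lock);

btrfs_wait_all_ordered_extents() then walks ordered_roots instead of every extent in the filesystem, grabbing a root reference before dropping the lock so the root cannot vanish while it waits.
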
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
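
With struct btrfs_sector_sum gone, each checksum is a bare u32 and the size helper becomes plain arithmetic: round the byte count up to whole sectors, then charge one u32 per sector on top of the header. For a 4096-byte sectorsize, 8192 bytes of data cost sizeof(struct btrfs_ordered_sum) + 2 * sizeof(u32). A userspace rendering of the same computation, with DIV_ROUND_UP spelled out:

    #include <stddef.h>
    #include <stdint.h>

    /* mirrors the reworked btrfs_ordered_sum_size(): header plus one
     * u32 checksum per (rounded-up) sector */
    static size_t ordered_sum_size(size_t header_size, uint32_t sectorsize,
                                   unsigned long bytes)
    {
            unsigned long num_sectors = (bytes + sectorsize - 1) / sectorsize;

            return header_size + num_sectors * sizeof(uint32_t);
    }

Note that the old helper also incremented the sector count by one after rounding; the DIV_ROUND_UP version drops that spare entry, so each allocation shrinks slightly as well.
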
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
2150 * However, btrfs_qgroup_account_ref may already be past its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we commit the transaction, which ensures that all
2154 * delayed refs are run; only after that do we clear all tracking
2155 * information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
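
The qgroup rescan rework above retires the kmalloc'd struct qgroup_rescan in favour of state embedded in fs_info, and splits startup into three stages: qgroup_rescan_init() validates flags and records the resume point, qgroup_rescan_zero_tracking() wipes per-qgroup counters, and only then is the work item queued. A completion object lets the new BTRFS_IOC_QUOTA_RESCAN_WAIT ioctl block until the worker finishes. The handshake in outline (fragments condensed from the diff):

    /* starting a rescan (btrfs_qgroup_rescan / btrfs_run_qgroups) */
    ret = qgroup_rescan_init(fs_info, 0, 1);
    if (!ret) {
            qgroup_rescan_zero_tracking(fs_info);
            btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
                               &fs_info->qgroup_rescan_work);
    }

    /* waiting for it (BTRFS_IOC_QUOTA_RESCAN_WAIT); the worker calls
     * complete_all(&fs_info->qgroup_rescan_completion) when done */
    if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
            ret = wait_for_completion_interruptible(
                            &fs_info->qgroup_rescan_completion);

Resuming after a crash reuses the same embedded work item: btrfs_qgroup_rescan_resume() simply requeues it if the RESCAN flag survived in the on-disk status item.
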
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
1351 * abuse rtransid, it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
2324 * we keep the old last snapshot transid in rtransid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
2339 * recover the last snapshot transid to avoid
2340 * the space balance breaking NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
2353 /* Check if the fs/file tree was snapshotted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
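
The rtransid trick above exists because NOCOW overwrites are only legal while the file extent's generation is newer than the root's last_snapshot, and relocation bumps last_snapshot when it creates the reloc tree; without the fix, every balance silently turned NOCOW files into COW ones. Since a relocation tree can never receive data, its rtransid field is free to park the old value. The two sites, reduced to their essence:

    /* create_reloc_root(): bump the source root's last_snapshot, but
     * stash the previous value in the reloc root item's rtransid */
    last_snap = btrfs_root_last_snapshot(&root->root_item);
    btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
    /* ... reloc root item initialized ... */
    btrfs_set_root_rtransid(root_item, last_snap);
    btrfs_set_root_otransid(root_item, trans->transid);

    /* merge_reloc_roots(): restore it, unless a real snapshot of the
     * fs/file tree happened while the balance was running */
    if (btrfs_root_last_snapshot(&root->root_item) == otransid - 1)
            btrfs_set_root_last_snapshot(&root->root_item, last_snap);
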
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
72 * root_key: the real key of the tree we look for
73 *
74 * If ->offset of 'search_key' is -1ULL, it means we are not sure of the
75 * offset of the search key; just look up the root with the highest offset
76 * for the given objectid.
77 *
78 * Returns 0 if we find something, > 0 if not, < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
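
btrfs_find_root() now folds the old find-last-root behaviour into a general lookup: an exact key finds one root item, while ->offset == -1ULL lands just past the newest item for the objectid and steps back one slot. A caller-side sketch of the highest-offset mode under the new signature (the caller supplies and frees the path):

    struct btrfs_key key = {
            .objectid = objectid,
            .type = BTRFS_ROOT_ITEM_KEY,
            .offset = (u64)-1,      /* "not sure": take the newest item */
    };

    ret = btrfs_find_root(tree_root, &key, path, &root_item, &root_key);
    /* 0: root_item/root_key filled in; > 0: no such root; < 0: error */
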
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
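
The helper removed above (its work moves into btrfs_find_orphan_roots below) was a textbook instance of the search-then-walk idiom. A condensed, schematic restatement of that loop (kernel-style fragment, not a buildable unit):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			/* leaf exhausted: hop to the next one */
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* > 0: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* ... filter on key.objectid/type and process the item ... */
		path->slots[0]++;
	}
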
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
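
One detail worth calling out in the rewritten loop: err = PTR_RET(root) folds the ERR_PTR-encoded return of btrfs_read_fs_root() into a plain errno, with 0 standing for a valid pointer, so a single integer test covers both cases. Schematically (PTR_RET is the 3.11-era name; mainline later renamed it PTR_ERR_OR_ZERO):

	#include <linux/err.h>

	static inline int ptr_ret_demo(const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);	/* negative errno */
		return 0;			/* valid pointer */
	}
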
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
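
The scrub_find_csum() rewrite above replaces a per-sector linear scan with direct indexing into the flat checksum array: the sector index is just the byte distance from the start of the ordered sum divided by the sector size. A runnable arithmetic check (values invented for illustration):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t bytenr = 0x10000;	/* start of the csummed range */
		uint64_t logical = 0x12000;	/* sector we need the csum for */
		uint32_t sectorsize = 4096;

		uint32_t index = (uint32_t)(logical - bytenr) / sectorsize;

		assert(index == 2);		/* third sector of the range */
		return 0;
	}
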
@@ -2501,10 +2495,11 @@ again:
2501 ret = scrub_extent(sctx, extent_logical, extent_len, 2495 ret = scrub_extent(sctx, extent_logical, extent_len,
2502 extent_physical, extent_dev, flags, 2496 extent_physical, extent_dev, flags,
2503 generation, extent_mirror_num, 2497 generation, extent_mirror_num,
2504 extent_physical); 2498 extent_logical - logical + physical);
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
3240 /* Avoid truncate/dio/punch hole... */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
3273 * If the page has been removed from the page cache,
3274 * the data on it is meaningless, because it may be
3275 * the old one; the new data may be written into a new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
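
The lock_page()/mapping re-check added above is the standard defence against racing truncate: the page was unlocked while the read completed, so it may have been dropped from the page cache in the meantime. A schematic helper capturing the pattern (kernel-style sketch assuming 3.11-era page-cache APIs, not code from this file):

	static bool page_still_attached(struct address_space *mapping,
					struct page *page)
	{
		lock_page(page);
		if (page->mapping != mapping) {
			/* truncated while we slept: contents are stale */
			unlock_page(page);
			page_cache_release(page);
			return false;	/* caller retries the lookup */
		}
		return true;		/* page is returned locked */
	}
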
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
990 * Retrieve the first path of an inode. If an inode has more than one 988
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
4573 * This is done when we look up the root; it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
4579 * If we just created this root, we need to make sure that the orphan
4580 * cleanup has been done and committed, since we search the commit root.
4581 * So check its commit root transid against our otransid and, if they match,
4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
4597 /* ENOENT means there's no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
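
The block added above boils down to "commit the running transaction if the root's orphan cleanup may only exist in a not-yet-committed transaction". The attach-or-skip part is a small pattern of its own (hypothetical helper name; the btrfs_* calls are the real 3.11 API):

	static int demo_commit_running_transaction(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			/* -ENOENT just means nothing is running */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			return PTR_ERR(trans);
		}
		return btrfs_commit_transaction(trans, root);
	}
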
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
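
These wake_up() calls matter because a task sleeping in wait_event() only re-tests its condition when its queue is kicked; pairing the abort with an ->aborted clause in the condition, as wait_current_trans() does in the transaction.c hunk later in this merge, is what keeps sleepers from hanging on a dead transaction:

	/* schematic waiter, mirroring wait_current_trans() below */
	wait_event(root->fs_info->transaction_wait,
		   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
		   cur_trans->aborted);
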
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..af1931a5960d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
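
The new table makes "may this handle type join the running transaction?" a single mask test (see the join_transaction() hunk below). A sketch of that test plus worked examples read straight off the table (helper name is illustrative):

	static inline bool demo_trans_blocked(unsigned int state,
					      unsigned int type)
	{
		return (btrfs_blocked_trans_types[state] & type) != 0;
	}

	/*
	 * demo_trans_blocked(TRANS_STATE_COMMIT_START, __TRANS_JOIN)        -> false
	 * demo_trans_blocked(TRANS_STATE_COMMIT_DOING, __TRANS_JOIN)        -> true
	 * demo_trans_blocked(TRANS_STATE_COMMIT_DOING, __TRANS_JOIN_NOLOCK) -> false
	 */
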
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
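
The extwriter counter only tracks handle types in TRANS_EXTWRITERS, i.e. writers entering from outside the commit path; internal JOIN/ATTACH handles are deliberately excluded. A sketch of the intended consumer, assuming the commit path drains external writers before flushing (the exact call site lies outside this hunk):

	/* wait until every external writer has detached from the trans */
	wait_event(cur_trans->writer_wait,
		   extwriter_counter_read(cur_trans) == 0);
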
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * it is impossible that ->running_transaction is NULL
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function, the difference is that this one 543 * It is similar to the above function, the difference is that this one
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
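The hunk above collapses the old single-pass loop, which ran delayed refs only when more than 64 heads were ready, into one throttle-driven pass. A condensed sketch of the new flow with commentary, using only names from this diff:

        unsigned long cur = trans->delayed_ref_updates; /* refs queued by this handle */

        trans->delayed_ref_updates = 0;
        if (btrfs_should_throttle_delayed_refs(trans, root)) {
                cur = max_t(unsigned long, cur, 1); /* always make some progress */
                btrfs_run_delayed_refs(trans, root, cur);
        }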
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
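These two hunks move the blk_plug out of btrfs_write_marked_extents() and into this caller, so the plug covers only bio submission and is flushed before anyone waits on the pages. The underlying pattern, as a standalone sketch (submit_io() and wait_io() are hypothetical stand-ins):

        struct blk_plug plug;

        blk_start_plug(&plug);  /* start batching bios built by this task */
        submit_io();            /* submission only; never sleep under the plug */
        blk_finish_plug(&plug); /* hand the batched bios to the device */
        wait_io();              /* now it is safe to wait for completion */

The tree-log.c changes later in this diff depend on this split: btrfs_sync_log() holds a single plug across the writes of both the log tree and the log root tree before waiting on either.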
@@ -935,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
935 * a dirty root struct and adds it into the list of dead roots that need to 983 * a dirty root struct and adds it into the list of dead roots that need to
936 * be deleted 984 * be deleted
937 */ 985 */
938int btrfs_add_dead_root(struct btrfs_root *root) 986void btrfs_add_dead_root(struct btrfs_root *root)
939{ 987{
940 spin_lock(&root->fs_info->trans_lock); 988 spin_lock(&root->fs_info->trans_lock);
941 list_add_tail(&root->root_list, &root->fs_info->dead_roots); 989 if (list_empty(&root->root_list))
990 list_add_tail(&root->root_list, &root->fs_info->dead_roots);
942 spin_unlock(&root->fs_info->trans_lock); 991 spin_unlock(&root->fs_info->trans_lock);
943 return 0;
944} 992}
945 993
946/* 994/*
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so the cleanup
1456 return; 1513 * function must never be called for it.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending work might be added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
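The rewritten prologue above replaces the should_grow/num_joined polling loop with the external-writer count. A condensed sketch of the new ordering, names as in this diff, error handling elided:

        extwriter_counter_dec(cur_trans, trans->type); /* this handle no longer counts */
        ret = btrfs_start_delalloc_flush(root->fs_info); /* kick writeback early */
        ret = btrfs_flush_all_pending_stuffs(trans, root);
        wait_event(cur_trans->writer_wait,
                   extwriter_counter_read(cur_trans) == 0); /* externals drained */
        ret = btrfs_flush_all_pending_stuffs(trans, root); /* catch late additions */
        btrfs_wait_delalloc_flush(root->fs_info); /* wait for the ordered IO */

Since only USERSPACE, START and ATTACH handles count as external writers (TRANS_EXTWRITERS in transaction.h below), joins issued by the commit machinery itself cannot stall the wait_event().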
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because there is no other task
1862 * which can change it.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
@@ -1897,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1897 } 1925 }
1898 root = list_first_entry(&fs_info->dead_roots, 1926 root = list_first_entry(&fs_info->dead_roots,
1899 struct btrfs_root, root_list); 1927 struct btrfs_root, root_list);
1900 list_del(&root->root_list); 1928 list_del_init(&root->root_list);
1901 spin_unlock(&fs_info->trans_lock); 1929 spin_unlock(&fs_info->trans_lock);
1902 1930
1903 pr_debug("btrfs: cleaner removing %llu\n", 1931 pr_debug("btrfs: cleaner removing %llu\n",
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..defbc4269897 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers (USERSPACE/START/ATTACH) in this
39 * transaction; it must be zero before the transaction can
40 * be committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction; it must be zero before the 44 * total writers in this transaction; it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Protected by fs_info->trans_lock; hold it when changing the state. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
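The bit-flag scheme above replaces ordinal comparisons such as 'type < TRANS_JOIN_NOLOCK' with explicit property tests. A standalone user-space demonstration of the classification (values copied from the defines above; not kernel code):

        #include <stdio.h>

        #define __TRANS_FREEZABLE   (1U << 0)
        #define __TRANS_USERSPACE   (1U << 8)
        #define __TRANS_START       (1U << 9)
        #define __TRANS_ATTACH      (1U << 10)
        #define __TRANS_JOIN        (1U << 11)
        #define __TRANS_JOIN_NOLOCK (1U << 12)

        #define TRANS_USERSPACE   (__TRANS_USERSPACE | __TRANS_FREEZABLE)
        #define TRANS_START       (__TRANS_START | __TRANS_FREEZABLE)
        #define TRANS_ATTACH      (__TRANS_ATTACH)
        #define TRANS_JOIN        (__TRANS_JOIN | __TRANS_FREEZABLE)
        #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
        #define TRANS_EXTWRITERS  (__TRANS_USERSPACE | __TRANS_START | \
                                   __TRANS_ATTACH)

        int main(void)
        {
                const unsigned int t[] = { TRANS_USERSPACE, TRANS_START,
                                           TRANS_ATTACH, TRANS_JOIN,
                                           TRANS_JOIN_NOLOCK };
                const char *n[] = { "USERSPACE", "START", "ATTACH",
                                    "JOIN", "JOIN_NOLOCK" };
                int i;

                for (i = 0; i < 5; i++) /* freezable => sb_start_intwrite() */
                        printf("%-11s freezable=%d extwriter=%d\n", n[i],
                               !!(t[i] & __TRANS_FREEZABLE),
                               !!(t[i] & TRANS_EXTWRITERS));
                return 0;
        }

USERSPACE, START and JOIN are freezable (they take sb_start_intwrite()); USERSPACE, START and ATTACH are external writers; JOIN_NOLOCK is neither.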
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
@@ -121,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
121int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 143int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root); 144 struct btrfs_root *root);
123 145
124int btrfs_add_dead_root(struct btrfs_root *root); 146void btrfs_add_dead_root(struct btrfs_root *root);
125int btrfs_defrag_root(struct btrfs_root *root); 147int btrfs_defrag_root(struct btrfs_root *root);
126int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); 148int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
127int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 149int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..ff60d8978ae2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -3728,8 +3746,9 @@ next_slot:
3728 } 3746 }
3729 3747
3730log_extents: 3748log_extents:
3749 btrfs_release_path(path);
3750 btrfs_release_path(dst_path);
3731 if (fast_search) { 3751 if (fast_search) {
3732 btrfs_release_path(dst_path);
3733 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3752 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3734 if (ret) { 3753 if (ret) {
3735 err = ret; 3754 err = ret;
@@ -3746,8 +3765,6 @@ log_extents:
3746 } 3765 }
3747 3766
3748 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3767 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3749 btrfs_release_path(path);
3750 btrfs_release_path(dst_path);
3751 ret = log_directory_changes(trans, root, inode, path, dst_path); 3768 ret = log_directory_changes(trans, root, inode, path, dst_path);
3752 if (ret) { 3769 if (ret) {
3753 err = ret; 3770 err = ret;
@@ -4016,8 +4033,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4033 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4034 break;
4018 4035
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4036 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4037 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4038 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4039 btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc moves the array with memcpy, which leaves the rb_node
234 * links pointing into the old allocation, so we have to rebuild
235 * the tree ourselves. Otherwise we may be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
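This hunk works around an intrusive-container pitfall: the rb_nodes live inside the krealloc'ed array, so after krealloc moves the array, the tree's parent/child pointers (and the root) still reference the old allocation. A toy user-space illustration of the failure mode, not btrfs code:

        #include <stdlib.h>

        struct elem { struct elem *next; long val; }; /* intrusive link */

        /* BROKEN unless all links are rebuilt: realloc may move the
         * array, and every ->next captured before the move now points
         * into the freed block (use-after-free on the next traversal). */
        static struct elem *grow(struct elem *arr, size_t new_n)
        {
                return realloc(arr, new_n * sizeof(*arr));
        }

Hence the pattern above: erase every node from the rbtree first, krealloc, then re-insert them all with ulist_rbtree_insert() so the links are rebuilt against the new addresses.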
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
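contains_pending_extent() closes a window opened elsewhere in this diff: dev extents for new chunks are now written only at commit time, and find_free_dev_extent() searches the commit root, so a pending chunk's stripes are invisible to the on-disk search and could be handed out twice. The core interval test, as a standalone sketch:

        #include <stdbool.h>
        #include <stdint.h>

        /* A stripe at [physical, physical + chunk_len) overlaps a hole
         * [*start, *start + len) unless it lies wholly before or after;
         * on overlap, restart the search just past the pending stripe. */
        static bool bump_past_stripe(uint64_t physical, uint64_t chunk_len,
                                     uint64_t *start, uint64_t len)
        {
                if (physical >= *start + len ||
                    physical + chunk_len <= *start)
                        return false;          /* disjoint: hole usable */
                *start = physical + chunk_len; /* skip the pending chunk */
                return true;
        }

In the kernel version, em->orig_block_len plays the role of chunk_len: it is set to the stripe size when the chunk's extent map is created (see the __btrfs_alloc_chunk hunk below).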
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
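find_next_chunk() no longer walks the on-disk chunk tree: chunk items are inserted only at commit time by btrfs_finish_chunk_alloc() (below), so the authoritative record of allocated logical space is the in-memory mapping tree. Annotated sketch of the new lookup, names as in this hunk:

        read_lock(&em_tree->lock);
        n = rb_last(&em_tree->map);        /* rightmost node = highest start */
        if (n) {
                em = rb_entry(n, struct extent_map, rb_node);
                ret = em->start + em->len; /* first logical byte past it */
        }
        read_unlock(&em_tree->lock);

This also means a freshly reserved but uncommitted chunk is already visible to the next caller, which is what keeps two allocations in one transaction from picking the same logical start.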
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
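
The net effect of the volumes.c changes above is a two-phase chunk allocation: the caller picks an offset and reserves the mapping in memory, and the on-disk chunk item is written later by btrfs_finish_chunk_alloc(), which re-derives everything it needs from the cached extent map instead of carrying map/size state between calls. A condensed sketch of the resulting flow (not the verbatim committed code; the deferred call site lies outside these hunks):

	/* phase 1: at allocation time, no chunk item is inserted yet */
	chunk_offset = find_next_chunk(fs_info);
	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
	/* __btrfs_alloc_chunk() records a map_lookup in the extent map tree */

	/* phase 2: later in the transaction, re-read that mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
	read_unlock(&em_tree->lock);
	if (!em)
		return -EINVAL;			/* mapping vanished */
	if (em->start != chunk_offset || em->len != chunk_size) {
		free_extent_map(em);		/* drop the lookup reference */
		return -EINVAL;			/* stale or overlapping entry */
	}

	/* ... and build the chunk item from the cached mapping */
	map = (struct map_lookup *)em->bdev;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;
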
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
83EXPORT_SYMBOL(unlock_buffer); 83EXPORT_SYMBOL(unlock_buffer);
84 84
85/* 85/*
 86 * Returns whether the page has dirty or writeback buffers. If all the
 87 * buffers are unlocked and clean then the PageDirty information is stale.
 88 * If any of the buffers are locked, it is assumed they are locked for IO.
89 */
90void buffer_check_dirty_writeback(struct page *page,
91 bool *dirty, bool *writeback)
92{
93 struct buffer_head *head, *bh;
94 *dirty = false;
95 *writeback = false;
96
97 BUG_ON(!PageLocked(page));
98
99 if (!page_has_buffers(page))
100 return;
101
102 if (PageWriteback(page))
103 *writeback = true;
104
105 head = page_buffers(page);
106 bh = head;
107 do {
108 if (buffer_locked(bh))
109 *writeback = true;
110
111 if (buffer_dirty(bh))
112 *dirty = true;
113
114 bh = bh->b_this_page;
115 } while (bh != head);
116}
117EXPORT_SYMBOL(buffer_check_dirty_writeback);
118
119/*
86 * Block until a buffer comes unlocked. This doesn't stop it 120 * Block until a buffer comes unlocked. This doesn't stop it
87 * from becoming locked again - you have to lock it yourself 121 * from becoming locked again - you have to lock it yourself
88 * if you want to preserve its state. 122 * if you want to preserve its state.
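
buffer_check_dirty_writeback() exists so that page reclaim can ask a buffer-backed mapping for a page's effective dirty/writeback state instead of trusting possibly-stale page flags. A minimal sketch of a consumer, assuming the ->is_dirty_writeback address_space operation that this helper is wired into elsewhere in this merge:

	static void page_check_dirty_writeback(struct page *page,
					       bool *dirty, bool *writeback)
	{
		struct address_space *mapping;

		/* default: believe the page flags */
		*dirty = PageDirty(page);
		*writeback = PageWriteback(page);

		/* a buffer-backed mapping may know better */
		mapping = page_mapping(page);
		if (mapping && mapping->a_ops->is_dirty_writeback)
			mapping->a_ops->is_dirty_writeback(page, dirty,
							   writeback);
	}
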
@@ -1454,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh)
1454 * block_invalidatepage - invalidate part or all of a buffer-backed page 1488 * block_invalidatepage - invalidate part or all of a buffer-backed page
1455 * 1489 *
1456 * @page: the page which is affected 1490 * @page: the page which is affected
1457 * @offset: the index of the truncation point 1491 * @offset: start of the range to invalidate
1492 * @length: length of the range to invalidate
1458 * 1493 *
1459 * block_invalidatepage() is called when all or part of the page has become 1494 * block_invalidatepage() is called when all or part of the page has become
1460 * invalidated by a truncate operation. 1495 * invalidated by a truncate operation.
@@ -1465,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh)
1465 * point. Because the caller is about to free (and possibly reuse) those 1500 * point. Because the caller is about to free (and possibly reuse) those
1466 * blocks on-disk. 1501 * blocks on-disk.
1467 */ 1502 */
1468void block_invalidatepage(struct page *page, unsigned long offset) 1503void block_invalidatepage(struct page *page, unsigned int offset,
1504 unsigned int length)
1469{ 1505{
1470 struct buffer_head *head, *bh, *next; 1506 struct buffer_head *head, *bh, *next;
1471 unsigned int curr_off = 0; 1507 unsigned int curr_off = 0;
1508 unsigned int stop = length + offset;
1472 1509
1473 BUG_ON(!PageLocked(page)); 1510 BUG_ON(!PageLocked(page));
1474 if (!page_has_buffers(page)) 1511 if (!page_has_buffers(page))
1475 goto out; 1512 goto out;
1476 1513
1514 /*
1515 * Check for overflow
1516 */
1517 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1518
1477 head = page_buffers(page); 1519 head = page_buffers(page);
1478 bh = head; 1520 bh = head;
1479 do { 1521 do {
@@ -1481,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
1481 next = bh->b_this_page; 1523 next = bh->b_this_page;
1482 1524
1483 /* 1525 /*
 1526 * Are we still fully in range?
1527 */
1528 if (next_off > stop)
1529 goto out;
1530
1531 /*
1484 * is this block fully invalidated? 1532 * is this block fully invalidated?
1485 */ 1533 */
1486 if (offset <= curr_off) 1534 if (offset <= curr_off)
@@ -1501,6 +1549,7 @@ out:
1501} 1549}
1502EXPORT_SYMBOL(block_invalidatepage); 1550EXPORT_SYMBOL(block_invalidatepage);
1503 1551
1552
1504/* 1553/*
1505 * We attach and possibly dirty the buffers atomically wrt 1554 * We attach and possibly dirty the buffers atomically wrt
1506 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2841 * they may have been added in ext3_writepage(). Make them 2890 * they may have been added in ext3_writepage(). Make them
2842 * freeable here, so the page does not leak. 2891 * freeable here, so the page does not leak.
2843 */ 2892 */
2844 do_invalidatepage(page, 0); 2893 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2845 unlock_page(page); 2894 unlock_page(page);
2846 return 0; /* don't care */ 2895 return 0; /* don't care */
2847 } 2896 }
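
With invalidatepage now taking an (offset, length) pair, an implementation can distinguish a partial punch from the whole-page invalidation that releases per-page private state; the ceph conversion further down does exactly this. A minimal buffer-backed implementation under the new signature (the function name is illustrative):

	static void example_invalidatepage(struct page *page,
					   unsigned int offset,
					   unsigned int length)
	{
		/* only a full-page invalidation may drop per-page state */
		if (offset == 0 && length == PAGE_CACHE_SIZE)
			ClearPageChecked(page);

		/* discard buffers inside [offset, offset + length) */
		block_invalidatepage(page, offset, length);
	}
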
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
13#include <linux/mount.h> 13#include <linux/mount.h>
14#include "internal.h" 14#include "internal.h"
15 15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data { 16struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */ 17 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */ 18 char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
212 object = container_of(_object, struct cachefiles_object, fscache); 210 object = container_of(_object, struct cachefiles_object, fscache);
213 cache = container_of(object->fscache.cache, struct cachefiles_cache, 211 cache = container_of(object->fscache.cache, struct cachefiles_cache,
214 cache); 212 cache);
213
214 if (!fscache_use_cookie(_object)) {
215 _leave(" [relinq]");
216 return;
217 }
218
215 cookie = object->fscache.cookie; 219 cookie = object->fscache.cookie;
216 220
217 if (!cookie->def->get_aux) { 221 if (!cookie->def->get_aux) {
222 fscache_unuse_cookie(_object);
218 _leave(" [no aux]"); 223 _leave(" [no aux]");
219 return; 224 return;
220 } 225 }
221 226
222 auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); 227 auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
223 if (!auxdata) { 228 if (!auxdata) {
229 fscache_unuse_cookie(_object);
224 _leave(" [nomem]"); 230 _leave(" [nomem]");
225 return; 231 return;
226 } 232 }
227 233
228 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); 234 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
235 fscache_unuse_cookie(_object);
229 ASSERTCMP(auxlen, <, 511); 236 ASSERTCMP(auxlen, <, 511);
230 237
231 auxdata->len = auxlen + 1; 238 auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
263#endif 270#endif
264 271
265 /* delete retired objects */ 272 /* delete retired objects */
266 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && 273 if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
267 _object != cache->cache.fsdef 274 _object != cache->cache.fsdef
268 ) { 275 ) {
269 _debug("- retire object OBJ%x", object->fscache.debug_id); 276 _debug("- retire object OBJ%x", object->fscache.debug_id);
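
The shape of the cachefiles_update_object() fix is a pin/unpin bracket around every cookie dereference: fscache_use_cookie() fails if the cookie is already being relinquished, and each early return must pair with fscache_unuse_cookie(). Reduced to its skeleton:

	if (!fscache_use_cookie(_object))
		return;				/* cookie going away */

	cookie = object->fscache.cookie;
	if (!cookie->def->get_aux) {
		fscache_unuse_cookie(_object);	/* drop pin on early exit */
		return;
	}

	/* ... safe to use the cookie here ... */
	fscache_unuse_cookie(_object);		/* drop pin on success path */
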
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
38 printk(KERN_ERR "%sobject: OBJ%x\n", 38 printk(KERN_ERR "%sobject: OBJ%x\n",
39 prefix, object->fscache.debug_id); 39 prefix, object->fscache.debug_id);
40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, object->fscache.state->name,
42 object->fscache.flags, work_busy(&object->fscache.work), 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, object->fscache.event_mask); 43 object->fscache.events, object->fscache.event_mask);
44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
127found_dentry: 127found_dentry:
128 kdebug("preemptive burial: OBJ%x [%s] %p", 128 kdebug("preemptive burial: OBJ%x [%s] %p",
129 object->fscache.debug_id, 129 object->fscache.debug_id,
130 fscache_object_states[object->fscache.state], 130 object->fscache.state->name,
131 dentry); 131 dentry);
132 132
133 if (object->fscache.state < FSCACHE_OBJECT_DYING) { 133 if (fscache_object_is_live(&object->fscache)) {
134 printk(KERN_ERR "\n"); 134 printk(KERN_ERR "\n");
135 printk(KERN_ERR "CacheFiles: Error:" 135 printk(KERN_ERR "CacheFiles: Error:"
136 " Can't preemptively bury live object\n"); 136 " Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
192 /* an old object from a previous incarnation is hogging the slot - we 192 /* an old object from a previous incarnation is hogging the slot - we
193 * need to wait for it to be destroyed */ 193 * need to wait for it to be destroyed */
194wait_for_old_object: 194wait_for_old_object:
195 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { 195 if (fscache_object_is_live(&object->fscache)) {
196 printk(KERN_ERR "\n"); 196 printk(KERN_ERR "\n");
197 printk(KERN_ERR "CacheFiles: Error:" 197 printk(KERN_ERR "CacheFiles: Error:"
198 " Unexpected object collision\n"); 198 " Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
836 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 836 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
837 837
838 /* look up the victim */ 838 /* look up the victim */
839 mutex_lock_nested(&dir->d_inode->i_mutex, 1); 839 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
840 840
841 start = jiffies; 841 start = jiffies;
842 victim = lookup_one_len(filename, dir, strlen(filename)); 842 victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/swap.h>
15#include "internal.h" 16#include "internal.h"
16 17
17/* 18/*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
227 */ 228 */
228static int cachefiles_read_backing_file_one(struct cachefiles_object *object, 229static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
229 struct fscache_retrieval *op, 230 struct fscache_retrieval *op,
230 struct page *netpage, 231 struct page *netpage)
231 struct pagevec *pagevec)
232{ 232{
233 struct cachefiles_one_read *monitor; 233 struct cachefiles_one_read *monitor;
234 struct address_space *bmapping; 234 struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
237 237
238 _enter(""); 238 _enter("");
239 239
240 pagevec_reinit(pagevec);
241
242 _debug("read back %p{%lu,%d}", 240 _debug("read back %p{%lu,%d}",
243 netpage, netpage->index, page_count(netpage)); 241 netpage, netpage->index, page_count(netpage));
244 242
@@ -283,9 +281,7 @@ installed_new_backing_page:
283 backpage = newpage; 281 backpage = newpage;
284 newpage = NULL; 282 newpage = NULL;
285 283
286 page_cache_get(backpage); 284 lru_cache_add_file(backpage);
287 pagevec_add(pagevec, backpage);
288 __pagevec_lru_add_file(pagevec);
289 285
290read_backing_page: 286read_backing_page:
291 ret = bmapping->a_ops->readpage(NULL, backpage); 287 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
452 if (block) { 448 if (block) {
453 /* submit the apparently valid page to the backing fs to be 449 /* submit the apparently valid page to the backing fs to be
454 * read from disk */ 450 * read from disk */
455 ret = cachefiles_read_backing_file_one(object, op, page, 451 ret = cachefiles_read_backing_file_one(object, op, page);
456 &pagevec);
457 } else if (cachefiles_has_space(cache, 0, 1) == 0) { 452 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
458 /* there's space in the cache we can use */ 453 /* there's space in the cache we can use */
459 fscache_mark_page_cached(op, page); 454 fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
482{ 477{
483 struct cachefiles_one_read *monitor = NULL; 478 struct cachefiles_one_read *monitor = NULL;
484 struct address_space *bmapping = object->backer->d_inode->i_mapping; 479 struct address_space *bmapping = object->backer->d_inode->i_mapping;
485 struct pagevec lru_pvec;
486 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; 480 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
487 int ret = 0; 481 int ret = 0;
488 482
489 _enter(""); 483 _enter("");
490 484
491 pagevec_init(&lru_pvec, 0);
492
493 list_for_each_entry_safe(netpage, _n, list, lru) { 485 list_for_each_entry_safe(netpage, _n, list, lru) {
494 list_del(&netpage->lru); 486 list_del(&netpage->lru);
495 487
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
534 backpage = newpage; 526 backpage = newpage;
535 newpage = NULL; 527 newpage = NULL;
536 528
537 page_cache_get(backpage); 529 lru_cache_add_file(backpage);
538 if (!pagevec_add(&lru_pvec, backpage))
539 __pagevec_lru_add_file(&lru_pvec);
540 530
541 reread_backing_page: 531 reread_backing_page:
542 ret = bmapping->a_ops->readpage(NULL, backpage); 532 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
559 goto nomem; 549 goto nomem;
560 } 550 }
561 551
562 page_cache_get(netpage); 552 lru_cache_add_file(netpage);
563 if (!pagevec_add(&lru_pvec, netpage))
564 __pagevec_lru_add_file(&lru_pvec);
565 553
566 /* install a monitor */ 554 /* install a monitor */
567 page_cache_get(netpage); 555 page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
643 631
644 fscache_mark_page_cached(op, netpage); 632 fscache_mark_page_cached(op, netpage);
645 633
646 page_cache_get(netpage); 634 lru_cache_add_file(netpage);
647 if (!pagevec_add(&lru_pvec, netpage))
648 __pagevec_lru_add_file(&lru_pvec);
649 635
650 /* the netpage is unlocked and marked up to date here */ 636 /* the netpage is unlocked and marked up to date here */
651 fscache_end_io(op, netpage, 0); 637 fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
661 647
662out: 648out:
663 /* tidy up */ 649 /* tidy up */
664 pagevec_lru_add_file(&lru_pvec);
665
666 if (newpage) 650 if (newpage)
667 page_cache_release(newpage); 651 page_cache_release(newpage);
668 if (netpage) 652 if (netpage)
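
All of the rdwr.c hunks above replace the same idiom: open-coded pagevec batching, where the caller takes an extra page reference, queues pages locally, and must remember a final flush on every exit path, with a single lru_cache_add_file() call that handles referencing and batching internally. Before and after, schematically:

	/* before: reference, batch, and flush by hand */
	page_cache_get(page);
	if (!pagevec_add(&lru_pvec, page))
		__pagevec_lru_add_file(&lru_pvec);
	/* ... and on every exit path: */
	pagevec_lru_add_file(&lru_pvec);

	/* after: one call, nothing to flush or leak */
	lru_cache_add_file(page);
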
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
109 struct dentry *dentry = object->dentry; 109 struct dentry *dentry = object->dentry;
110 int ret; 110 int ret;
111 111
112 ASSERT(object->fscache.cookie);
113 ASSERT(dentry); 112 ASSERT(dentry);
114 113
115 _enter("%p,#%d", object, auxdata->len); 114 _enter("%p,#%d", object, auxdata->len);
116 115
117 /* attempt to install the cache metadata directly */ 116 /* attempt to install the cache metadata directly */
118 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); 117 _debug("SET #%u", auxdata->len);
119 118
120 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
121 &auxdata->type, auxdata->len, 120 &auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
138 struct dentry *dentry = object->dentry; 137 struct dentry *dentry = object->dentry;
139 int ret; 138 int ret;
140 139
141 ASSERT(object->fscache.cookie);
142 ASSERT(dentry); 140 ASSERT(dentry);
143 141
144 _enter("%p,#%d", object, auxdata->len); 142 _enter("%p,#%d", object, auxdata->len);
145 143
146 /* attempt to install the cache metadata directly */ 144 /* attempt to install the cache metadata directly */
147 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); 145 _debug("SET #%u", auxdata->len);
148 146
149 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
150 &auxdata->type, auxdata->len, 148 &auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f1d6c60ab229..722585cd5c7e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
143 * dirty page counters appropriately. Only called if there is private 143 * dirty page counters appropriately. Only called if there is private
144 * data on the page. 144 * data on the page.
145 */ 145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset) 146static void ceph_invalidatepage(struct page *page, unsigned int offset,
147 unsigned int length)
147{ 148{
148 struct inode *inode; 149 struct inode *inode;
149 struct ceph_inode_info *ci; 150 struct ceph_inode_info *ci;
@@ -159,20 +160,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
159 if (!PageDirty(page)) 160 if (!PageDirty(page))
160 pr_err("%p invalidatepage %p page not dirty\n", inode, page); 161 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
161 162
162 if (offset == 0) 163 if (offset == 0 && length == PAGE_CACHE_SIZE)
163 ClearPageChecked(page); 164 ClearPageChecked(page);
164 165
165 ci = ceph_inode(inode); 166 ci = ceph_inode(inode);
166 if (offset == 0) { 167 if (offset == 0 && length == PAGE_CACHE_SIZE) {
167 dout("%p invalidatepage %p idx %lu full dirty page %lu\n", 168 dout("%p invalidatepage %p idx %lu full dirty page\n",
168 inode, page, page->index, offset); 169 inode, page, page->index);
169 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 170 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
170 ceph_put_snap_context(snapc); 171 ceph_put_snap_context(snapc);
171 page->private = 0; 172 page->private = 0;
172 ClearPagePrivate(page); 173 ClearPagePrivate(page);
173 } else { 174 } else {
174 dout("%p invalidatepage %p idx %lu partial dirty page\n", 175 dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
175 inode, page, page->index); 176 inode, page, page->index, offset, length);
176 } 177 }
177} 178}
178 179
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0e4da4a9c213..868b61d56cac 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p)
111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
112 * the MDS if/when the directory is modified). 112 * the MDS if/when the directory is modified).
113 */ 113 */
114static int __dcache_readdir(struct file *filp, 114static int __dcache_readdir(struct file *file, struct dir_context *ctx)
115 void *dirent, filldir_t filldir)
116{ 115{
117 struct ceph_file_info *fi = filp->private_data; 116 struct ceph_file_info *fi = file->private_data;
118 struct dentry *parent = filp->f_dentry; 117 struct dentry *parent = file->f_dentry;
119 struct inode *dir = parent->d_inode; 118 struct inode *dir = parent->d_inode;
120 struct list_head *p; 119 struct list_head *p;
121 struct dentry *dentry, *last; 120 struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
126 last = fi->dentry; 125 last = fi->dentry;
127 fi->dentry = NULL; 126 fi->dentry = NULL;
128 127
129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 128 dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
130 last); 129 last);
131 130
132 spin_lock(&parent->d_lock); 131 spin_lock(&parent->d_lock);
133 132
134 /* start at beginning? */ 133 /* start at beginning? */
135 if (filp->f_pos == 2 || last == NULL || 134 if (ctx->pos == 2 || last == NULL ||
136 filp->f_pos < ceph_dentry(last)->offset) { 135 ctx->pos < ceph_dentry(last)->offset) {
137 if (list_empty(&parent->d_subdirs)) 136 if (list_empty(&parent->d_subdirs))
138 goto out_unlock; 137 goto out_unlock;
139 p = parent->d_subdirs.prev; 138 p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
157 if (!d_unhashed(dentry) && dentry->d_inode && 156 if (!d_unhashed(dentry) && dentry->d_inode &&
158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 157 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 158 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
160 filp->f_pos <= di->offset) 159 ctx->pos <= di->offset)
161 break; 160 break;
162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 161 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
163 dentry->d_name.len, dentry->d_name.name, di->offset, 162 dentry->d_name.len, dentry->d_name.name, di->offset,
164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 163 ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
165 !dentry->d_inode ? " null" : ""); 164 !dentry->d_inode ? " null" : "");
166 spin_unlock(&dentry->d_lock); 165 spin_unlock(&dentry->d_lock);
167 p = p->prev; 166 p = p->prev;
@@ -173,29 +172,27 @@ more:
173 spin_unlock(&dentry->d_lock); 172 spin_unlock(&dentry->d_lock);
174 spin_unlock(&parent->d_lock); 173 spin_unlock(&parent->d_lock);
175 174
176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 175 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 176 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
178 filp->f_pos = di->offset; 177 ctx->pos = di->offset;
179 err = filldir(dirent, dentry->d_name.name, 178 if (!dir_emit(ctx, dentry->d_name.name,
180 dentry->d_name.len, di->offset, 179 dentry->d_name.len,
181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 180 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
182 dentry->d_inode->i_mode >> 12); 181 dentry->d_inode->i_mode >> 12)) {
183 182 if (last) {
184 if (last) {
185 if (err < 0) {
186 /* remember our position */ 183 /* remember our position */
187 fi->dentry = last; 184 fi->dentry = last;
188 fi->next_offset = di->offset; 185 fi->next_offset = di->offset;
189 } else {
190 dput(last);
191 } 186 }
187 dput(dentry);
188 return 0;
192 } 189 }
193 last = dentry;
194 190
195 if (err < 0) 191 if (last)
196 goto out; 192 dput(last);
193 last = dentry;
197 194
198 filp->f_pos++; 195 ctx->pos++;
199 196
200 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 197 /* make sure a dentry wasn't dropped while we didn't have parent lock */
201 if (!ceph_dir_is_complete(dir)) { 198 if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
235 return 0; 232 return 0;
236} 233}
237 234
238static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 235static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 236{
240 struct ceph_file_info *fi = filp->private_data; 237 struct ceph_file_info *fi = file->private_data;
241 struct inode *inode = file_inode(filp); 238 struct inode *inode = file_inode(file);
242 struct ceph_inode_info *ci = ceph_inode(inode); 239 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 240 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 241 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(filp->f_pos); 242 unsigned frag = fpos_frag(ctx->pos);
246 int off = fpos_off(filp->f_pos); 243 int off = fpos_off(ctx->pos);
247 int err; 244 int err;
248 u32 ftype; 245 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 246 struct ceph_mds_reply_info_parsed *rinfo;
250 const int max_entries = fsc->mount_options->max_readdir; 247 const int max_entries = fsc->mount_options->max_readdir;
251 const int max_bytes = fsc->mount_options->max_readdir_bytes; 248 const int max_bytes = fsc->mount_options->max_readdir_bytes;
252 249
253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 250 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
254 if (fi->flags & CEPH_F_ATEND) 251 if (fi->flags & CEPH_F_ATEND)
255 return 0; 252 return 0;
256 253
257 /* always start with . and .. */ 254 /* always start with . and .. */
258 if (filp->f_pos == 0) { 255 if (ctx->pos == 0) {
259 /* note dir version at start of readdir so we can tell 256 /* note dir version at start of readdir so we can tell
260 * if any dentries get dropped */ 257 * if any dentries get dropped */
261 fi->dir_release_count = atomic_read(&ci->i_release_count); 258 fi->dir_release_count = atomic_read(&ci->i_release_count);
262 259
263 dout("readdir off 0 -> '.'\n"); 260 dout("readdir off 0 -> '.'\n");
264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 261 if (!dir_emit(ctx, ".", 1,
265 ceph_translate_ino(inode->i_sb, inode->i_ino), 262 ceph_translate_ino(inode->i_sb, inode->i_ino),
266 inode->i_mode >> 12) < 0) 263 inode->i_mode >> 12))
267 return 0; 264 return 0;
268 filp->f_pos = 1; 265 ctx->pos = 1;
269 off = 1; 266 off = 1;
270 } 267 }
271 if (filp->f_pos == 1) { 268 if (ctx->pos == 1) {
272 ino_t ino = parent_ino(filp->f_dentry); 269 ino_t ino = parent_ino(file->f_dentry);
273 dout("readdir off 1 -> '..'\n"); 270 dout("readdir off 1 -> '..'\n");
274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 271 if (!dir_emit(ctx, "..", 2,
275 ceph_translate_ino(inode->i_sb, ino), 272 ceph_translate_ino(inode->i_sb, ino),
276 inode->i_mode >> 12) < 0) 273 inode->i_mode >> 12))
277 return 0; 274 return 0;
278 filp->f_pos = 2; 275 ctx->pos = 2;
279 off = 2; 276 off = 2;
280 } 277 }
281 278
282 /* can we use the dcache? */ 279 /* can we use the dcache? */
283 spin_lock(&ci->i_ceph_lock); 280 spin_lock(&ci->i_ceph_lock);
284 if ((filp->f_pos == 2 || fi->dentry) && 281 if ((ctx->pos == 2 || fi->dentry) &&
285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 282 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
286 ceph_snap(inode) != CEPH_SNAPDIR && 283 ceph_snap(inode) != CEPH_SNAPDIR &&
287 __ceph_dir_is_complete(ci) && 284 __ceph_dir_is_complete(ci) &&
288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 285 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
289 spin_unlock(&ci->i_ceph_lock); 286 spin_unlock(&ci->i_ceph_lock);
290 err = __dcache_readdir(filp, dirent, filldir); 287 err = __dcache_readdir(file, ctx);
291 if (err != -EAGAIN) 288 if (err != -EAGAIN)
292 return err; 289 return err;
293 } else { 290 } else {
@@ -327,7 +324,7 @@ more:
327 return PTR_ERR(req); 324 return PTR_ERR(req);
328 req->r_inode = inode; 325 req->r_inode = inode;
329 ihold(inode); 326 ihold(inode);
330 req->r_dentry = dget(filp->f_dentry); 327 req->r_dentry = dget(file->f_dentry);
331 /* hints to request -> mds selection code */ 328 /* hints to request -> mds selection code */
332 req->r_direct_mode = USE_AUTH_MDS; 329 req->r_direct_mode = USE_AUTH_MDS;
333 req->r_direct_hash = ceph_frag_value(frag); 330 req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@ more:
379 rinfo = &fi->last_readdir->r_reply_info; 376 rinfo = &fi->last_readdir->r_reply_info;
380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 377 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
381 rinfo->dir_nr, off, fi->offset); 378 rinfo->dir_nr, off, fi->offset);
379
380 ctx->pos = ceph_make_fpos(frag, off);
382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
383 u64 pos = ceph_make_fpos(frag, off);
384 struct ceph_mds_reply_inode *in = 382 struct ceph_mds_reply_inode *in =
385 rinfo->dir_in[off - fi->offset].in; 383 rinfo->dir_in[off - fi->offset].in;
386 struct ceph_vino vino; 384 struct ceph_vino vino;
387 ino_t ino; 385 ino_t ino;
388 386
389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 387 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
390 off, off - fi->offset, rinfo->dir_nr, pos, 388 off, off - fi->offset, rinfo->dir_nr, ctx->pos,
391 rinfo->dir_dname_len[off - fi->offset], 389 rinfo->dir_dname_len[off - fi->offset],
392 rinfo->dir_dname[off - fi->offset], in); 390 rinfo->dir_dname[off - fi->offset], in);
393 BUG_ON(!in); 391 BUG_ON(!in);
@@ -395,16 +393,15 @@ more:
395 vino.ino = le64_to_cpu(in->ino); 393 vino.ino = le64_to_cpu(in->ino);
396 vino.snap = le64_to_cpu(in->snapid); 394 vino.snap = le64_to_cpu(in->snapid);
397 ino = ceph_vino_to_ino(vino); 395 ino = ceph_vino_to_ino(vino);
398 if (filldir(dirent, 396 if (!dir_emit(ctx,
399 rinfo->dir_dname[off - fi->offset], 397 rinfo->dir_dname[off - fi->offset],
400 rinfo->dir_dname_len[off - fi->offset], 398 rinfo->dir_dname_len[off - fi->offset],
401 pos, 399 ceph_translate_ino(inode->i_sb, ino), ftype)) {
402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
403 dout("filldir stopping us...\n"); 400 dout("filldir stopping us...\n");
404 return 0; 401 return 0;
405 } 402 }
406 off++; 403 off++;
407 filp->f_pos = pos + 1; 404 ctx->pos++;
408 } 405 }
409 406
410 if (fi->last_name) { 407 if (fi->last_name) {
@@ -417,7 +414,7 @@ more:
417 if (!ceph_frag_is_rightmost(frag)) { 414 if (!ceph_frag_is_rightmost(frag)) {
418 frag = ceph_frag_next(frag); 415 frag = ceph_frag_next(frag);
419 off = 0; 416 off = 0;
420 filp->f_pos = ceph_make_fpos(frag, off); 417 ctx->pos = ceph_make_fpos(frag, off);
421 dout("readdir next frag is %x\n", frag); 418 dout("readdir next frag is %x\n", frag);
422 goto more; 419 goto more;
423 } 420 }
@@ -432,11 +429,11 @@ more:
432 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { 429 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 430 dout(" marking %p complete\n", inode);
434 __ceph_dir_set_complete(ci, fi->dir_release_count); 431 __ceph_dir_set_complete(ci, fi->dir_release_count);
435 ci->i_max_offset = filp->f_pos; 432 ci->i_max_offset = ctx->pos;
436 } 433 }
437 spin_unlock(&ci->i_ceph_lock); 434 spin_unlock(&ci->i_ceph_lock);
438 435
439 dout("readdir %p filp %p done.\n", inode, filp); 436 dout("readdir %p file %p done.\n", inode, file);
440 return 0; 437 return 0;
441} 438}
442 439
@@ -1270,7 +1267,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1270 1267
1271const struct file_operations ceph_dir_fops = { 1268const struct file_operations ceph_dir_fops = {
1272 .read = ceph_read_dir, 1269 .read = ceph_read_dir,
1273 .readdir = ceph_readdir, 1270 .iterate = ceph_readdir,
1274 .llseek = ceph_dir_llseek, 1271 .llseek = ceph_dir_llseek,
1275 .open = ceph_open, 1272 .open = ceph_open,
1276 .release = ceph_release, 1273 .release = ceph_release,
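
The ceph conversion follows the standard 3.11 pattern for moving from ->readdir() to ->iterate(): the (dirent, filldir) pair becomes a struct dir_context, ctx->pos supersedes file->f_pos, and dir_emit() returns false (rather than a negative error) when the destination buffer is full. A minimal ->iterate() in that style, for illustration only (ceph itself emits "." and ".." by hand):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		/* emits "." and ".." as needed and leaves ctx->pos at 2 */
		if (!dir_emit_dots(file, ctx))
			return 0;

		if (ctx->pos == 2) {
			/* false means the caller's buffer is full: stop */
			if (!dir_emit(ctx, "example", 7, 1234, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}
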
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a17ffe4ec3ca..bc0735498d29 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -861,16 +861,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
861 break; 861 break;
862 } 862 }
863 863
864 if (offset < 0 || offset > inode->i_sb->s_maxbytes) { 864 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
865 offset = -EINVAL;
866 goto out;
867 }
868
869 /* Special lock needed here? */
870 if (offset != file->f_pos) {
871 file->f_pos = offset;
872 file->f_version = 0;
873 }
874 865
875out: 866out:
876 mutex_unlock(&inode->i_mutex); 867 mutex_unlock(&inode->i_mutex);
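
vfs_setpos(), added in the same cycle, centralizes what ceph_llseek and many other llseek implementations used to open-code. Its behaviour is essentially the deleted lines above; as a sketch of the helper's logic (not its exact source):

	if (offset < 0 || offset > maxsize)
		return -EINVAL;		/* out of range for this fs */

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;	/* force cached state revalidation */
	}
	return offset;
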
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3b0abed667c2..98b6e50bde04 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -911,8 +911,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
911 } else if (realdn) { 911 } else if (realdn) {
912 dout("dn %p (%d) spliced with %p (%d) " 912 dout("dn %p (%d) spliced with %p (%d) "
913 "inode %p ino %llx.%llx\n", 913 "inode %p ino %llx.%llx\n",
914 dn, dn->d_count, 914 dn, d_count(dn),
915 realdn, realdn->d_count, 915 realdn, d_count(realdn),
916 realdn->d_inode, ceph_vinop(realdn->d_inode)); 916 realdn->d_inode, ceph_vinop(realdn->d_inode));
917 dput(dn); 917 dput(dn);
918 dn = realdn; 918 dn = realdn;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 89788515a63d..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -192,7 +192,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
192 192
193/** 193/**
194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock 194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
195 * array. Must be called with lock_flocks() already held. 195 * array. Must be called with inode->i_lock already held.
196 * If we encounter more of a specific lock type than expected, return -ENOSPC. 196 * If we encounter more of a specific lock type than expected, return -ENOSPC.
197 */ 197 */
198int ceph_encode_locks_to_buffer(struct inode *inode, 198int ceph_encode_locks_to_buffer(struct inode *inode,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index cbf08203e00d..603786b564be 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1588,7 +1588,7 @@ retry:
1588 *base = ceph_ino(temp->d_inode); 1588 *base = ceph_ino(temp->d_inode);
1589 *plen = len; 1589 *plen = len;
1590 dout("build_path on %p %d built %llx '%.*s'\n", 1590 dout("build_path on %p %d built %llx '%.*s'\n",
1591 dentry, dentry->d_count, *base, len, path); 1591 dentry, d_count(dentry), *base, len, path);
1592 return path; 1592 return path;
1593} 1593}
1594 1594
@@ -2517,20 +2517,20 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2517 struct ceph_filelock *flocks; 2517 struct ceph_filelock *flocks;
2518 2518
2519encode_again: 2519encode_again:
2520 lock_flocks(); 2520 spin_lock(&inode->i_lock);
2521 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2521 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2522 unlock_flocks(); 2522 spin_unlock(&inode->i_lock);
2523 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2523 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2524 sizeof(struct ceph_filelock), GFP_NOFS); 2524 sizeof(struct ceph_filelock), GFP_NOFS);
2525 if (!flocks) { 2525 if (!flocks) {
2526 err = -ENOMEM; 2526 err = -ENOMEM;
2527 goto out_free; 2527 goto out_free;
2528 } 2528 }
2529 lock_flocks(); 2529 spin_lock(&inode->i_lock);
2530 err = ceph_encode_locks_to_buffer(inode, flocks, 2530 err = ceph_encode_locks_to_buffer(inode, flocks,
2531 num_fcntl_locks, 2531 num_fcntl_locks,
2532 num_flock_locks); 2532 num_flock_locks);
2533 unlock_flocks(); 2533 spin_unlock(&inode->i_lock);
2534 if (err) { 2534 if (err) {
2535 kfree(flocks); 2535 kfree(flocks);
2536 if (err == -ENOSPC) 2536 if (err == -ENOSPC)
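
Replacing the global lock_flocks() with inode->i_lock keeps encode_caps_cb()'s count/alloc/retry loop intact: the lock counts are sampled under the spinlock, the buffer is allocated with GFP_NOFS outside it, and -ENOSPC from the encoder means locks were added in the window, so the whole sequence repeats. The loop, reduced:

	encode_again:
		spin_lock(&inode->i_lock);
		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
		spin_unlock(&inode->i_lock);

		/* cannot allocate while holding a spinlock */
		flocks = kmalloc((num_fcntl_locks + num_flock_locks) *
				 sizeof(struct ceph_filelock), GFP_NOFS);
		if (!flocks)
			return -ENOMEM;

		spin_lock(&inode->i_lock);
		err = ceph_encode_locks_to_buffer(inode, flocks,
						  num_fcntl_locks,
						  num_flock_locks);
		spin_unlock(&inode->i_lock);
		if (err == -ENOSPC) {
			kfree(flocks);	/* more locks appeared: resize */
			goto encode_again;
		}
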
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2906ee276408..603f18a65c12 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
10 select CRYPTO_ECB 10 select CRYPTO_ECB
11 select CRYPTO_DES 11 select CRYPTO_DES
12 select CRYPTO_SHA256 12 select CRYPTO_SHA256
13 select CRYPTO_CMAC
13 help 14 help
14 This is the client VFS module for the Common Internet File System 15 This is the client VFS module for the Common Internet File System
15 (CIFS) protocol which is the successor to the Server Message Block 16 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d59748346020..f3ac4154cbb6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
213 tcon->nativeFileSystem); 213 tcon->nativeFileSystem);
214 } 214 }
215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" 215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
216 "\nPathComponentMax: %d Status: 0x%d", 216 "\n\tPathComponentMax: %d Status: 0x%d",
217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), 217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
218 le32_to_cpu(tcon->fsAttrInfo.Attributes), 218 le32_to_cpu(tcon->fsAttrInfo.Attributes),
219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), 219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -224,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
224 seq_puts(m, " type: CDROM "); 224 seq_puts(m, " type: CDROM ");
225 else 225 else
226 seq_printf(m, " type: %d ", dev_type); 226 seq_printf(m, " type: %d ", dev_type);
227 if (server->ops->dump_share_caps)
228 server->ops->dump_share_caps(m, tcon);
227 229
228 if (tcon->need_reconnect) 230 if (tcon->need_reconnect)
229 seq_puts(m, "\tDISCONNECTED "); 231 seq_puts(m, "\tDISCONNECTED ");
@@ -595,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
595 return single_open(file, cifs_security_flags_proc_show, NULL); 597 return single_open(file, cifs_security_flags_proc_show, NULL);
596} 598}
597 599
600/*
 601 * Ensure that if someone sets a MUST flag, we disable all other MAY
 602 * flags except for the ones corresponding to the given MUST flag. If there
 603 * are multiple MUST flags, prefer the more secure ones.
604 */
605static void
606cifs_security_flags_handle_must_flags(unsigned int *flags)
607{
608 unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
609
610 if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
611 *flags = CIFSSEC_MUST_KRB5;
612 else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
613 *flags = CIFSSEC_MUST_NTLMSSP;
614 else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
615 *flags = CIFSSEC_MUST_NTLMV2;
616 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
617 *flags = CIFSSEC_MUST_NTLM;
618 else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
619 *flags = CIFSSEC_MUST_LANMAN;
620 else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
621 *flags = CIFSSEC_MUST_PLNTXT;
622
623 *flags |= signflags;
624}
625
598static ssize_t cifs_security_flags_proc_write(struct file *file, 626static ssize_t cifs_security_flags_proc_write(struct file *file,
599 const char __user *buffer, size_t count, loff_t *ppos) 627 const char __user *buffer, size_t count, loff_t *ppos)
600{ 628{
629 int rc;
601 unsigned int flags; 630 unsigned int flags;
602 char flags_string[12]; 631 char flags_string[12];
603 char c; 632 char c;
@@ -620,26 +649,35 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
620 global_secflags = CIFSSEC_MAX; 649 global_secflags = CIFSSEC_MAX;
621 return count; 650 return count;
622 } else if (!isdigit(c)) { 651 } else if (!isdigit(c)) {
623 cifs_dbg(VFS, "invalid flag %c\n", c); 652 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
653 flags_string);
624 return -EINVAL; 654 return -EINVAL;
625 } 655 }
626 } 656 }
627 /* else we have a number */
628 657
629 flags = simple_strtoul(flags_string, NULL, 0); 658 /* else we have a number */
659 rc = kstrtouint(flags_string, 0, &flags);
660 if (rc) {
661 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
662 flags_string);
663 return rc;
664 }
630 665
631 cifs_dbg(FYI, "sec flags 0x%x\n", flags); 666 cifs_dbg(FYI, "sec flags 0x%x\n", flags);
632 667
633 if (flags <= 0) { 668 if (flags == 0) {
634 cifs_dbg(VFS, "invalid security flags %s\n", flags_string); 669 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
635 return -EINVAL; 670 return -EINVAL;
636 } 671 }
637 672
638 if (flags & ~CIFSSEC_MASK) { 673 if (flags & ~CIFSSEC_MASK) {
639 cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n", 674 cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
640 flags & ~CIFSSEC_MASK); 675 flags & ~CIFSSEC_MASK);
641 return -EINVAL; 676 return -EINVAL;
642 } 677 }
678
679 cifs_security_flags_handle_must_flags(&flags);
680
643 /* flags look ok - update the global security flags for cifs module */ 681 /* flags look ok - update the global security flags for cifs module */
644 global_secflags = flags; 682 global_secflags = flags;
645 if (global_secflags & CIFSSEC_MUST_SIGN) { 683 if (global_secflags & CIFSSEC_MUST_SIGN) {
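
The masking in cifs_security_flags_handle_must_flags() relies on each CIFSSEC_MUST_* constant being the matching CIFSSEC_MAY_* bit plus a dedicated MUST bit, so (*flags & MUST_X) == MUST_X tests both bits at once and the plain assignment drops every other mechanism. A worked example; the hex values follow the cifsglob.h encoding of this era and are shown for illustration:

	/* assume CIFSSEC_MAY_SIGN == 0x00001, CIFSSEC_MUST_KRB5 == 0x08008 */
	unsigned int flags = 0x0800f;	/* SIGN|NTLM|NTLMV2 MAY + MUST_KRB5 */
	unsigned int signflags = flags & CIFSSEC_MUST_SIGN;	/* 0x00001 */

	/* MUST_KRB5 wins: one assignment clears the NTLM/NTLMV2 MAY bits */
	flags = CIFSSEC_MUST_KRB5;	/* 0x08008 */
	flags |= signflags;		/* 0x08009: signing choice survives */
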
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e21..fe8d6276410a 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
327/* 327/*
328 * UniStrupr: Upper case a unicode string 328 * UniStrupr: Upper case a unicode string
329 */ 329 */
330static inline wchar_t * 330static inline __le16 *
331UniStrupr(register wchar_t *upin) 331UniStrupr(register __le16 *upin)
332{ 332{
333 register wchar_t *up; 333 register __le16 *up;
334 334
335 up = upin; 335 up = upin;
336 while (*up) { /* For all characters */ 336 while (*up) { /* For all characters */
337 *up = UniToupper(*up); 337 *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
338 up++; 338 up++;
339 } 339 }
340 return upin; /* Return input pointer */ 340 return upin; /* Return input pointer */
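
The strings reaching UniStrupr() are on-the-wire UTF-16LE, not host-order wchar_t: on a big-endian machine the old version would case-map byte-swapped code units and corrupt the uppercased key material fed into the NTLMv2 hash below. The safe per-character round trip is:

	__le16 *up = upin;

	while (*up) {
		/* wire order -> CPU order, case-map, back to wire order */
		*up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
		up++;
	}
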
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 71436d1fca13..fc6f4f3a1a9d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsencrypt.c 2 * fs/cifs/cifsencrypt.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2005,2006 4 * Copyright (C) International Business Machines Corp., 2005,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -31,6 +31,37 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33 33
34static int
35cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
36{
37 int rc;
38 unsigned int size;
39
40 if (server->secmech.sdescmd5 != NULL)
41 return 0; /* already allocated */
42
43 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
44 if (IS_ERR(server->secmech.md5)) {
45 cifs_dbg(VFS, "could not allocate crypto md5\n");
46 rc = PTR_ERR(server->secmech.md5);
47 server->secmech.md5 = NULL;
48 return rc;
49 }
50
51 size = sizeof(struct shash_desc) +
52 crypto_shash_descsize(server->secmech.md5);
53 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
54 if (!server->secmech.sdescmd5) {
55 crypto_free_shash(server->secmech.md5);
56 server->secmech.md5 = NULL;
57 return -ENOMEM;
58 }
59 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
60 server->secmech.sdescmd5->shash.flags = 0x0;
61
62 return 0;
63}
64
34/* 65/*
35 * Calculate and return the CIFS signature based on the mac key and SMB PDU. 66 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
36 * The 16 byte signature must be allocated by the caller. Note we only use the 67 * The 16 byte signature must be allocated by the caller. Note we only use the
@@ -50,8 +81,11 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
50 return -EINVAL; 81 return -EINVAL;
51 82
52 if (!server->secmech.sdescmd5) { 83 if (!server->secmech.sdescmd5) {
53 cifs_dbg(VFS, "%s: Can't generate signature\n", __func__); 84 rc = cifs_crypto_shash_md5_allocate(server);
54 return -1; 85 if (rc) {
86 cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
87 return -1;
88 }
55 } 89 }
56 90
57 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 91 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
@@ -276,7 +310,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
276 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 310 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
277 311
278 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { 312 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
279 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
280 memcpy(lnm_session_key, password_with_pad, 313 memcpy(lnm_session_key, password_with_pad,
281 CIFS_ENCPWD_SIZE); 314 CIFS_ENCPWD_SIZE);
282 return 0; 315 return 0;
@@ -389,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
389 if (blobptr + attrsize > blobend) 422 if (blobptr + attrsize > blobend)
390 break; 423 break;
391 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { 424 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
392 if (!attrsize) 425 if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
393 break; 426 break;
394 if (!ses->domainName) { 427 if (!ses->domainName) {
395 ses->domainName = 428 ses->domainName =
@@ -414,7 +447,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
414 int rc = 0; 447 int rc = 0;
415 int len; 448 int len;
416 char nt_hash[CIFS_NTHASH_SIZE]; 449 char nt_hash[CIFS_NTHASH_SIZE];
417 wchar_t *user; 450 __le16 *user;
418 wchar_t *domain; 451 wchar_t *domain;
419 wchar_t *server; 452 wchar_t *server;
420 453
@@ -439,7 +472,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
439 return rc; 472 return rc;
440 } 473 }
441 474
442 /* convert ses->user_name to unicode and uppercase */ 475 /* convert ses->user_name to unicode */
443 len = ses->user_name ? strlen(ses->user_name) : 0; 476 len = ses->user_name ? strlen(ses->user_name) : 0;
444 user = kmalloc(2 + (len * 2), GFP_KERNEL); 477 user = kmalloc(2 + (len * 2), GFP_KERNEL);
445 if (user == NULL) { 478 if (user == NULL) {
@@ -448,7 +481,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
448 } 481 }
449 482
450 if (len) { 483 if (len) {
451 len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); 484 len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
452 UniStrupr(user); 485 UniStrupr(user);
453 } else { 486 } else {
454 memset(user, '\0', 2); 487 memset(user, '\0', 2);
@@ -536,7 +569,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
536 return rc; 569 return rc;
537 } 570 }
538 571
539 if (ses->server->secType == RawNTLMSSP) 572 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
540 memcpy(ses->auth_key.response + offset, 573 memcpy(ses->auth_key.response + offset,
541 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); 574 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
542 else 575 else
@@ -557,6 +590,36 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
557 return rc; 590 return rc;
558} 591}
559 592
593static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
594{
595 int rc;
596 unsigned int size;
597
598 /* check if already allocated */
599 if (server->secmech.sdeschmacmd5)
600 return 0;
601
602 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
603 if (IS_ERR(server->secmech.hmacmd5)) {
604 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
605 rc = PTR_ERR(server->secmech.hmacmd5);
606 server->secmech.hmacmd5 = NULL;
607 return rc;
608 }
609
610 size = sizeof(struct shash_desc) +
611 crypto_shash_descsize(server->secmech.hmacmd5);
612 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
613 if (!server->secmech.sdeschmacmd5) {
614 crypto_free_shash(server->secmech.hmacmd5);
615 server->secmech.hmacmd5 = NULL;
616 return -ENOMEM;
617 }
618 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
619 server->secmech.sdeschmacmd5->shash.flags = 0x0;
620
621 return 0;
622}
560 623
561int 624int
562setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) 625setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
@@ -568,7 +631,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
568 char ntlmv2_hash[16]; 631 char ntlmv2_hash[16];
569 unsigned char *tiblob = NULL; /* target info blob */ 632 unsigned char *tiblob = NULL; /* target info blob */
570 633
571 if (ses->server->secType == RawNTLMSSP) { 634 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
572 if (!ses->domainName) { 635 if (!ses->domainName) {
573 rc = find_domain_name(ses, nls_cp); 636 rc = find_domain_name(ses, nls_cp);
574 if (rc) { 637 if (rc) {
@@ -607,6 +670,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
607 670
608 memcpy(ses->auth_key.response + baselen, tiblob, tilen); 671 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
609 672
673 rc = crypto_hmacmd5_alloc(ses->server);
674 if (rc) {
675 cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
676 goto setup_ntlmv2_rsp_ret;
677 }
678
610 /* calculate ntlmv2_hash */ 679 /* calculate ntlmv2_hash */
611 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); 680 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
612 if (rc) { 681 if (rc) {
@@ -706,94 +775,32 @@ calc_seckey(struct cifs_ses *ses)
706void 775void
707cifs_crypto_shash_release(struct TCP_Server_Info *server) 776cifs_crypto_shash_release(struct TCP_Server_Info *server)
708{ 777{
709 if (server->secmech.hmacsha256) 778 if (server->secmech.cmacaes) {
710 crypto_free_shash(server->secmech.hmacsha256); 779 crypto_free_shash(server->secmech.cmacaes);
711 780 server->secmech.cmacaes = NULL;
712 if (server->secmech.md5)
713 crypto_free_shash(server->secmech.md5);
714
715 if (server->secmech.hmacmd5)
716 crypto_free_shash(server->secmech.hmacmd5);
717
718 kfree(server->secmech.sdeschmacsha256);
719
720 kfree(server->secmech.sdeschmacmd5);
721
722 kfree(server->secmech.sdescmd5);
723}
724
725int
726cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
727{
728 int rc;
729 unsigned int size;
730
731 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
732 if (IS_ERR(server->secmech.hmacmd5)) {
733 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
734 return PTR_ERR(server->secmech.hmacmd5);
735 }
736
737 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
738 if (IS_ERR(server->secmech.md5)) {
739 cifs_dbg(VFS, "could not allocate crypto md5\n");
740 rc = PTR_ERR(server->secmech.md5);
741 goto crypto_allocate_md5_fail;
742 } 781 }
743 782
744 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); 783 if (server->secmech.hmacsha256) {
745 if (IS_ERR(server->secmech.hmacsha256)) { 784 crypto_free_shash(server->secmech.hmacsha256);
746 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); 785 server->secmech.hmacsha256 = NULL;
747 rc = PTR_ERR(server->secmech.hmacsha256);
748 goto crypto_allocate_hmacsha256_fail;
749 }
750
751 size = sizeof(struct shash_desc) +
752 crypto_shash_descsize(server->secmech.hmacmd5);
753 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
754 if (!server->secmech.sdeschmacmd5) {
755 rc = -ENOMEM;
756 goto crypto_allocate_hmacmd5_sdesc_fail;
757 } 786 }
758 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
759 server->secmech.sdeschmacmd5->shash.flags = 0x0;
760 787
761 size = sizeof(struct shash_desc) + 788 if (server->secmech.md5) {
762 crypto_shash_descsize(server->secmech.md5); 789 crypto_free_shash(server->secmech.md5);
763 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); 790 server->secmech.md5 = NULL;
764 if (!server->secmech.sdescmd5) {
765 rc = -ENOMEM;
766 goto crypto_allocate_md5_sdesc_fail;
767 } 791 }
768 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
769 server->secmech.sdescmd5->shash.flags = 0x0;
770 792
771 size = sizeof(struct shash_desc) + 793 if (server->secmech.hmacmd5) {
772 crypto_shash_descsize(server->secmech.hmacsha256); 794 crypto_free_shash(server->secmech.hmacmd5);
773 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); 795 server->secmech.hmacmd5 = NULL;
774 if (!server->secmech.sdeschmacsha256) {
775 rc = -ENOMEM;
776 goto crypto_allocate_hmacsha256_sdesc_fail;
777 } 796 }
778 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
779 server->secmech.sdeschmacsha256->shash.flags = 0x0;
780
781 return 0;
782
783crypto_allocate_hmacsha256_sdesc_fail:
784 kfree(server->secmech.sdescmd5);
785 797
786crypto_allocate_md5_sdesc_fail: 798 kfree(server->secmech.sdesccmacaes);
799 server->secmech.sdesccmacaes = NULL;
800 kfree(server->secmech.sdeschmacsha256);
801 server->secmech.sdeschmacsha256 = NULL;
787 kfree(server->secmech.sdeschmacmd5); 802 kfree(server->secmech.sdeschmacmd5);
788 803 server->secmech.sdeschmacmd5 = NULL;
789crypto_allocate_hmacmd5_sdesc_fail: 804 kfree(server->secmech.sdescmd5);
790 crypto_free_shash(server->secmech.hmacsha256); 805 server->secmech.sdescmd5 = NULL;
791
792crypto_allocate_hmacsha256_fail:
793 crypto_free_shash(server->secmech.md5);
794
795crypto_allocate_md5_fail:
796 crypto_free_shash(server->secmech.hmacmd5);
797
798 return rc;
799} 806}
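
The cifsencrypt.c hunks above replace the single up-front cifs_crypto_shash_allocate() with allocate-on-first-use helpers such as crypto_hmacmd5_alloc(), and make the release path NULL each pointer so it can run safely at any point. Below is a minimal sketch of that same pattern, not part of the patch: it reuses cifs's struct sdesc (a shash_desc plus an opaque context buffer) and the helper name is hypothetical.

	/*
	 * Illustrative sketch: allocate a shash tfm and its descriptor once,
	 * on first use. Callers may invoke this repeatedly; it is a no-op
	 * when the descriptor already exists.
	 */
	static int lazy_shash_alloc(const char *alg, struct crypto_shash **tfm,
				    struct sdesc **sdesc)
	{
		unsigned int size;
		int rc;

		if (*sdesc)		/* already allocated */
			return 0;

		*tfm = crypto_alloc_shash(alg, 0, 0);
		if (IS_ERR(*tfm)) {
			rc = PTR_ERR(*tfm);
			*tfm = NULL;
			return rc;
		}

		size = sizeof(struct shash_desc) + crypto_shash_descsize(*tfm);
		*sdesc = kmalloc(size, GFP_KERNEL);
		if (!*sdesc) {
			crypto_free_shash(*tfm);
			*tfm = NULL;
			return -ENOMEM;
		}
		(*sdesc)->shash.tfm = *tfm;
		(*sdesc)->shash.flags = 0;
		return 0;
	}

Allocating lazily means a mount that never does NTLMv2 authentication never pays for the HMAC-MD5 machinery, and because failure paths leave both pointers NULL, cifs_crypto_shash_release() stays idempotent.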
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3752b9f6d9e4..85ea98d139fc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb)
 		goto out_no_root;
 	}
 
+	if (cifs_sb_master_tcon(cifs_sb)->nocase)
+		sb->s_d_op = &cifs_ci_dentry_ops;
+	else
+		sb->s_d_op = &cifs_dentry_ops;
+
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root) {
 		rc = -ENOMEM;
 		goto out_no_root;
 	}
 
-	/* do that *after* d_make_root() - we want NULL ->d_op for root here */
-	if (cifs_sb_master_tcon(cifs_sb)->nocase)
-		sb->s_d_op = &cifs_ci_dentry_ops;
-	else
-		sb->s_d_op = &cifs_dentry_ops;
-
 #ifdef CONFIG_CIFS_NFSD_EXPORT
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifs_dbg(FYI, "export ops supported\n");
@@ -312,11 +311,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 }
 
 static void
-cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
 {
+	if (ses->sectype == Unspecified)
+		return;
+
 	seq_printf(s, ",sec=");
 
-	switch (server->secType) {
+	switch (ses->sectype) {
 	case LANMAN:
 		seq_printf(s, "lanman");
 		break;
@@ -338,7 +340,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
 		break;
 	}
 
-	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->sign)
 		seq_printf(s, "i");
 }
 
@@ -369,7 +371,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
-	cifs_show_security(s, tcon->ses->server);
+	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
@@ -765,7 +767,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with lock_flocks held
+	/* note that this is called by vfs setlease with i_lock held
 	   to protect *lease from going away */
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
@@ -968,7 +970,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 };
 
 const struct file_operations cifs_dir_ops = {
-	.readdir = cifs_readdir,
+	.iterate = cifs_readdir,
 	.release = cifs_closedir,
 	.read = generic_read_dir,
 	.unlocked_ioctl = cifs_ioctl,
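
The cifs_read_super() hunk moves the sb->s_d_op assignment ahead of d_make_root(). Since the VFS copies sb->s_d_op into each dentry as it is allocated, the root dentry now picks up the same (possibly case-insensitive) dentry operations as the rest of the tree, where the old code deliberately left the root with a NULL ->d_op. A compressed sketch of the resulting shape, with hypothetical helper names:

	/*
	 * Illustrative only: default dentry ops must be chosen before the
	 * root dentry is created for them to apply to the root as well.
	 */
	static int example_fill_super(struct super_block *sb, bool nocase)
	{
		sb->s_d_op = nocase ? &example_ci_dentry_ops
				    : &example_dentry_ops;
		sb->s_root = d_make_root(example_get_root_inode(sb));
		return sb->s_root ? 0 : -ENOMEM;
	}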
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c3446ce9..ea723a5e8226 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
 
 /* Functions related to dir entries */
 extern const struct dentry_operations cifs_dentry_ops;
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION "2.0"
+#define CIFS_VERSION "2.01"
 #endif /* _CIFSFS_H */
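
The cifs_readdir() prototype change above (together with the .readdir to .iterate switch in cifsfs.c) is part of the kernel-wide 3.11 conversion of directory reading: the position now lives in ctx->pos and entries are emitted through dir_emit() instead of an opaque filldir callback. A self-contained sketch of the new calling convention, not cifs code, with made-up names and inode numbers:

	#include <linux/fs.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	/* Illustrative ->iterate body. */
	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		static const char * const names[] = { "alpha", "beta" };

		if (!dir_emit_dots(file, ctx))	/* emits "." and ".." */
			return 0;

		for (; (size_t)(ctx->pos - 2) < ARRAY_SIZE(names); ctx->pos++) {
			const char *name = names[ctx->pos - 2];

			/* dir_emit() returns false when the buffer is full */
			if (!dir_emit(ctx, name, strlen(name),
				      100 + ctx->pos, DT_REG))
				return 0;
		}
		return 0;
	}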
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f07f6fbe494..52ca861ed35e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -44,6 +44,7 @@
 #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
 #define MAX_SERVER_SIZE 15
 #define MAX_SHARE_SIZE 80
+#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
 #define MAX_USERNAME_SIZE 256	/* reasonable maximum for current servers */
 #define MAX_PASSWORD_SIZE 512	/* max for windows seems to be 256 wide chars */
 
@@ -101,20 +102,14 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	LANMAN = 0,		/* Legacy LANMAN auth */
+	Unspecified = 0,	/* not specified */
+	LANMAN,			/* Legacy LANMAN auth */
 	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
-/*	NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,		/* Kerberos via SPNEGO */
 };
 
-enum protocolEnum {
-	TCP = 0,
-	SCTP
-	/* Netbios frames protocol not supported at this time */
-};
-
 struct session_key {
 	unsigned int len;
 	char *response;
@@ -131,9 +126,11 @@ struct cifs_secmech {
 	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
 	struct crypto_shash *md5; /* md5 hash function */
 	struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+	struct crypto_shash *cmacaes; /* block-cipher based MAC function */
 	struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
 	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
 	struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
+	struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
 };
 
 /* per smb session structure/fields */
@@ -181,6 +178,7 @@ enum smb_version {
 	Smb_20,
 	Smb_21,
 	Smb_30,
+	Smb_302,
 };
 
 struct mid_q_entry;
@@ -197,6 +195,7 @@ struct cifs_writedata;
 struct cifs_io_parms;
 struct cifs_search_info;
 struct cifsInodeInfo;
+struct cifs_open_parms;
 
 struct smb_version_operations {
 	int (*send_cancel)(struct TCP_Server_Info *, void *,
@@ -228,6 +227,7 @@ struct smb_version_operations {
 	void (*dump_detail)(void *);
 	void (*clear_stats)(struct cifs_tcon *);
 	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
 	/* verify the message */
 	int (*check_message)(char *, unsigned int);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -309,9 +309,8 @@ struct smb_version_operations {
 		     const char *, const char *,
 		     struct cifs_sb_info *);
 	/* open a file for non-posix mounts */
-	int (*open)(const unsigned int, struct cifs_tcon *, const char *, int,
-		    int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *,
-		    struct cifs_sb_info *);
+	int (*open)(const unsigned int, struct cifs_open_parms *,
+		    __u32 *, FILE_ALL_INFO *);
 	/* set fid protocol-specific info */
 	void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
 	/* close a file */
@@ -367,8 +366,13 @@ struct smb_version_operations {
 	void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
 	/* generate new lease key */
 	void (*new_lease_key)(struct cifs_fid *fid);
+	/* The next two functions will need to be changed to per smb session */
+	void (*generate_signingkey)(struct TCP_Server_Info *server);
 	int (*calc_signature)(struct smb_rqst *rqst,
 				   struct TCP_Server_Info *server);
+	int (*query_mf_symlink)(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid);
 };
 
 struct smb_version_values {
@@ -387,6 +391,8 @@ struct smb_version_values {
 	unsigned int	cap_nt_find;
 	unsigned int	cap_large_files;
 	unsigned int	oplock_read;
+	__u16		signing_enabled;
+	__u16		signing_required;
 };
 
 #define HEADER_SIZE(server) (server->vals->header_size)
@@ -407,7 +413,8 @@ struct smb_vol {
 	kgid_t	backupgid;
 	umode_t	file_mode;
 	umode_t	dir_mode;
-	unsigned secFlg;
+	enum securityEnum sectype; /* sectype requested via mnt opts */
+	bool	sign; /* was signing requested via mnt opts? */
 	bool	retry:1;
 	bool	intr:1;
 	bool	setuids:1;
@@ -441,6 +448,7 @@ struct smb_vol {
 	bool	mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool	multiuser:1;
 	bool	rwpidforward:1; /* pid forward for read/write operations */
+	bool	nosharesock;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool	sockopt_tcp_nodelay:1;
@@ -514,6 +522,7 @@ struct TCP_Server_Info {
 	struct task_struct *tsk;
 	char server_GUID[16];
 	__u16 sec_mode;
+	bool sign; /* is signing enabled on this connection? */
 	bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_SMB2
 	int echo_credits; /* echo reserved slots */
@@ -521,7 +530,6 @@ struct TCP_Server_Info {
 	bool echoes:1; /* enable echoes */
 #endif
 	u16 dialect; /* dialect index that server chose */
-	enum securityEnum secType;
 	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using */
@@ -540,12 +548,17 @@ struct TCP_Server_Info {
 	int timeAdj; /* Adjust for difference in server time zone in sec */
 	__u64 CurrentMid; /* multiplex id - rotating counter */
 	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define	CIFS_NEGFLAVOR_LANMAN	0	/* wct == 13, LANMAN */
+#define	CIFS_NEGFLAVOR_UNENCAP	1	/* wct == 17, but no ext_sec */
+#define	CIFS_NEGFLAVOR_EXTENDED	2	/* wct == 17, ext_sec bit set */
+	char	negflavor;	/* NEGOTIATE response flavor */
 	/* extended security flavors that server supports */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
@@ -697,7 +710,6 @@ struct cifs_ses {
 	enum statusEnum status;
 	unsigned overrideSecFlg; /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
-	__u16 flags;
 	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
@@ -714,21 +726,14 @@ struct cifs_ses {
 	char *password;
 	struct session_key auth_key;
 	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+	enum securityEnum sectype; /* what security flavor was specified? */
+	bool sign;		/* is signing required? */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
#endif /* CONFIG_CIFS_SMB2 */
 };
 
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
-   which do not negotiate NTLM or POSIX dialects, but instead
-   negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
-
 static inline bool
 cap_unix(struct cifs_ses *ses)
 {
@@ -816,7 +821,7 @@ struct cifs_tcon {
 #ifdef CONFIG_CIFS_SMB2
 	bool print:1;		/* set if connection to printer share */
 	bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
-	__u32 capabilities;
+	__le32 capabilities;
 	__u32 share_flags;
 	__u32 maximal_access;
 	__u32 vol_serial_number;
@@ -911,6 +916,17 @@ struct cifs_search_info {
 	bool smallBuf:1; /* so we know which buf_release function to call */
 };
 
+struct cifs_open_parms {
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	int disposition;
+	int desired_access;
+	int create_options;
+	const char *path;
+	struct cifs_fid *fid;
+	bool reconnect:1;
+};
+
 struct cifs_fid {
 	__u16 netfid;
 #ifdef CONFIG_CIFS_SMB2
@@ -1348,7 +1364,7 @@ require use of the stronger protocol */
 #define CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
 #define CIFSSEC_MUST_NTLMSSP	0x80080 /* raw ntlmssp with ntlmv2 */
 
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
 #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
 #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
 /*
@@ -1494,4 +1510,7 @@ extern struct smb_version_values smb21_values;
 #define SMB30_VERSION_STRING	"3.0"
 extern struct smb_version_operations smb30_operations;
 extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING	"3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
 #endif /* _CIFS_GLOB_H */
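
One detail of the CIFSSEC_* scheme that this patch keeps relying on: each MUST flag is defined as a superset of the corresponding MAY flag (CIFSSEC_MUST_NTLMSSP, 0x80080 above, carries the MAY bit 0x00080), which is why "must" is tested with a masked compare rather than a plain AND. A standalone illustration with hypothetical flag values following the same convention:

	/* Illustrative values mirroring the CIFSSEC MAY/MUST pairing. */
	#define EX_MAY_SIGN	0x00001
	#define EX_MUST_SIGN	0x01001	/* includes the MAY bit */

	static int ex_may_sign(unsigned int flags)
	{
		return (flags & EX_MAY_SIGN) != 0;	/* true for MUST too */
	}

	static int ex_must_sign(unsigned int flags)
	{
		return (flags & EX_MUST_SIGN) == EX_MUST_SIGN;
	}

This is the same shape of test that cifs_enable_signing() in the cifssmb.c hunks below applies to global_secflags.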
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e996ff6b26d1..11ca24a8e054 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
  */
 #define CIFS_SESS_KEY_SIZE (16)
 
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
 #define CIFS_CLIENT_CHALLENGE_SIZE (8)
 #define CIFS_SERVER_CHALLENGE_SIZE (8)
 #define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp {
 #define READ_RAW_ENABLE 1
 #define WRITE_RAW_ENABLE 2
 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
 typedef struct negotiate_rsp {
 	struct smb_hdr hdr;	/* wct = 17 */
 	__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -553,7 +558,7 @@ typedef struct negotiate_rsp {
 		/* followed by 16 bytes of server GUID */
 		/* then security blob if cap_extended_security negotiated */
 		struct {
-			unsigned char GUID[16];
+			unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
 			unsigned char SecurityBlob[1];
 		} __attribute__((packed)) extended_response;
 	} __attribute__((packed)) u;
@@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
 	/* parms and data follow */
 } __attribute__((packed)) NTRANSACT_RSP;
 
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+	__le64 SourceOffset;
+	__le64 DestinationOffset;
+	__le32 CopyLength;
+	__u32 Reserved;
+} __packed;
+
 typedef struct smb_com_transaction_ioctl_req {
 	struct smb_hdr hdr;	/* wct = 23 */
 	__u8 MaxSetupCount;
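
struct srv_copychunk added above is an on-the-wire structure: the __le64/__le32 fields are little-endian regardless of host byte order, and __packed keeps the compiler from inserting padding, so the struct occupies exactly 24 bytes. Fields are therefore filled through the endian helpers; a hypothetical marshalling helper, assuming the usual kernel headers:

	/* Illustrative: filling a copychunk entry for the wire. */
	static void fill_copychunk(struct srv_copychunk *chunk,
				   u64 src_off, u64 dst_off, u32 len)
	{
		chunk->SourceOffset = cpu_to_le64(src_off);
		chunk->DestinationOffset = cpu_to_le64(dst_off);
		chunk->CopyLength = cpu_to_le32(len);
		chunk->Reserved = 0;
	}

On a little-endian host the conversions compile away; on big-endian hosts they byte-swap, and sparse flags any direct mixing of __le types with native integers.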
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index dda188a94332..b29a012bed33 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifs_ses *ses,
 				void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+				enum securityEnum requested);
 extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 			  const struct nls_table *nls_cp);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
 				   struct cifs_ses *ses);
 extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 			      struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
 extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
 
 extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -430,9 +433,9 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
 			const struct nls_table *);
 extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
-extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
+extern void generate_smb3signingkey(struct TCP_Server_Info *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
@@ -494,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work);
 struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
 						work_func_t complete);
 void cifs_writedata_release(struct kref *refcount);
-
+int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid);
 #endif /* _CIFSPROTO_H */
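
select_sectype(), newly exported above, encapsulates the mapping from the NEGOTIATE flavor to an authentication type: an explicit request comes back unchanged only if the connection can satisfy it, otherwise Unspecified signals a mismatch (the match_security() hunk in connect.c below relies on exactly that contract). The following is a simplified sketch of the idea, not the real sess.c implementation; the defaults and preference order are assumptions:

	/* Illustrative approximation of the select_sectype() contract. */
	static enum securityEnum
	example_select_sectype(struct TCP_Server_Info *server,
			       enum securityEnum requested)
	{
		switch (server->negflavor) {
		case CIFS_NEGFLAVOR_EXTENDED:
			switch (requested) {
			case Kerberos:
			case RawNTLMSSP:
				return requested;
			case Unspecified:
				if (server->sec_kerberos ||
				    server->sec_mskerberos)
					return Kerberos; /* assumed preference */
				if (server->sec_ntlmssp)
					return RawNTLMSSP;
				/* Fallthrough */
			default:
				return Unspecified;
			}
		case CIFS_NEGFLAVOR_UNENCAP:
			switch (requested) {
			case NTLM:
			case NTLMv2:
				return requested;
			case Unspecified:
				return NTLMv2;	/* assumed default */
			default:
				return Unspecified;
			}
		default:	/* CIFS_NEGFLAVOR_LANMAN */
			return requested == LANMAN ? LANMAN : Unspecified;
		}
	}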
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a58dc77cc443..a89c4cb4e6cf 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -367,6 +367,185 @@ vt2_err:
 	return -EINVAL;
 }
 
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+	int	rc = 0;
+	u16	count;
+	char	*guid = pSMBr->u.extended_response.GUID;
+	struct TCP_Server_Info *server = ses->server;
+
+	count = get_bcc(&pSMBr->hdr);
+	if (count < SMB1_CLIENT_GUID_SIZE)
+		return -EIO;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (server->srv_count > 1) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+			cifs_dbg(FYI, "server UID changed\n");
+			memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+		}
+	} else {
+		spin_unlock(&cifs_tcp_ses_lock);
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	}
+
+	if (count == SMB1_CLIENT_GUID_SIZE) {
+		server->sec_ntlmssp = true;
+	} else {
+		count -= SMB1_CLIENT_GUID_SIZE;
+		rc = decode_negTokenInit(
+			pSMBr->u.extended_response.SecurityBlob, count, server);
+		if (rc != 1)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+	bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+	bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+	bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+	/*
+	 * Is signing required by mnt options? If not then check
+	 * global_secflags to see if it is there.
+	 */
+	if (!mnt_sign_required)
+		mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+						CIFSSEC_MUST_SIGN);
+
+	/*
+	 * If signing is required then it's automatically enabled too,
+	 * otherwise, check to see if the secflags allow it.
+	 */
+	mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+				(global_secflags & CIFSSEC_MAY_SIGN);
+
+	/* If server requires signing, does client allow it? */
+	if (srv_sign_required) {
+		if (!mnt_sign_enabled) {
+			cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	/* If client requires signing, does server allow it? */
+	if (mnt_sign_required) {
+		if (!srv_sign_enabled) {
+			cifs_dbg(VFS, "Server does not support signing!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	__s16 tmp;
+	struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+	if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+		return -EOPNOTSUPP;
+
+	server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	set_credits(server, server->maxReq);
+	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
+	/* even though we do not use raw we might as well set this
+	accurately, in case we ever find a need for it */
+	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+		server->max_rw = 0xFF00;
+		server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+	} else {
+		server->max_rw = 0;/* do not need to use raw anyway */
+		server->capabilities = CAP_MPX_MODE;
+	}
+	tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+	if (tmp == -1) {
+		/* OS/2 often does not set timezone therefore
+		 * we must use server time to calc time zone.
+		 * Could deviate slightly from the right zone.
+		 * Smallest defined timezone difference is 15 minutes
+		 * (i.e. Nepal).  Rounding up/down is done to match
+		 * this requirement.
+		 */
+		int val, seconds, remain, result;
+		struct timespec ts, utc;
+		utc = CURRENT_TIME;
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
+		cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+			 (int)ts.tv_sec, (int)utc.tv_sec,
+			 (int)(utc.tv_sec - ts.tv_sec));
+		val = (int)(utc.tv_sec - ts.tv_sec);
+		seconds = abs(val);
+		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+		remain = seconds % MIN_TZ_ADJ;
+		if (remain >= (MIN_TZ_ADJ / 2))
+			result += MIN_TZ_ADJ;
+		if (val < 0)
+			result = -result;
+		server->timeAdj = result;
+	} else {
+		server->timeAdj = (int)tmp;
+		server->timeAdj *= 60; /* also in seconds */
+	}
+	cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+	/* BB get server time for time conversions and add
+	code to use it and timezone since this is not UTC */
+
+	if (rsp->EncryptionKeyLength ==
+			cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		memcpy(server->cryptkey, rsp->EncryptionKey,
+		       CIFS_CRYPTO_KEY_SIZE);
+	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+		return -EIO; /* need cryptkey unless plain text */
+	}
+
+	cifs_dbg(FYI, "LANMAN negotiated\n");
+	return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+	return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+	switch (sectype) {
+	case RawNTLMSSP:
+	case Kerberos:
+		return true;
+	case Unspecified:
+		if (global_secflags &
+		    (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+			return true;
+		/* Fallthrough */
+	default:
+		return false;
+	}
+}
+
 int
 CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 {
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	int rc = 0;
 	int bytes_returned;
 	int i;
-	struct TCP_Server_Info *server;
+	struct TCP_Server_Info *server = ses->server;
 	u16 count;
-	unsigned int secFlags;
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
+
 	rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
 		      (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = ses->overrideSecFlg;  /* BB FIXME fix sign flags? */
-	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "secFlags 0x%x\n", secFlags);
-
 	pSMB->hdr.Mid = get_next_mid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
 
-	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n");
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	} else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n");
+	if (should_set_ext_sec_flag(ses->sectype)) {
+		cifs_dbg(FYI, "Requesting extended security.");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 
@@ -436,127 +598,21 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 		could not negotiate a common dialect */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((server->dialect == LANMAN_PROT)
-			    || (server->dialect == LANMAN2_PROT))) {
-		__s16 tmp;
-		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
-		if ((secFlags & CIFSSEC_MAY_LANMAN) ||
-			(secFlags & CIFSSEC_MAY_PLNTXT))
-			server->secType = LANMAN;
-		else {
-			cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-			goto neg_err_exit;
-		}
-		server->sec_mode = le16_to_cpu(rsp->SecurityMode);
-		server->maxReq = min_t(unsigned int,
-				       le16_to_cpu(rsp->MaxMpxCount),
-				       cifs_max_pending);
-		set_credits(server, server->maxReq);
-		server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
-		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
-		/* even though we do not use raw we might as well set this
-		accurately, in case we ever find a need for it */
-		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->max_rw = 0xFF00;
-			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
-		} else {
-			server->max_rw = 0;/* do not need to use raw anyway */
-			server->capabilities = CAP_MPX_MODE;
-		}
-		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
-		if (tmp == -1) {
-			/* OS/2 often does not set timezone therefore
-			 * we must use server time to calc time zone.
-			 * Could deviate slightly from the right zone.
-			 * Smallest defined timezone difference is 15 minutes
-			 * (i.e. Nepal).  Rounding up/down is done to match
-			 * this requirement.
-			 */
-			int val, seconds, remain, result;
-			struct timespec ts, utc;
-			utc = CURRENT_TIME;
-			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
-					    rsp->SrvTime.Time, 0);
-			cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
-				 (int)ts.tv_sec, (int)utc.tv_sec,
-				 (int)(utc.tv_sec - ts.tv_sec));
-			val = (int)(utc.tv_sec - ts.tv_sec);
-			seconds = abs(val);
-			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
-			remain = seconds % MIN_TZ_ADJ;
-			if (remain >= (MIN_TZ_ADJ / 2))
-				result += MIN_TZ_ADJ;
-			if (val < 0)
-				result = -result;
-			server->timeAdj = result;
-		} else {
-			server->timeAdj = (int)tmp;
-			server->timeAdj *= 60; /* also in seconds */
-		}
-		cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
-
-
-		/* BB get server time for time conversions and add
-		code to use it and timezone since this is not UTC */
-
-		if (rsp->EncryptionKeyLength ==
-				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(ses->server->cryptkey, rsp->EncryptionKey,
-			       CIFS_CRYPTO_KEY_SIZE);
-		} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
-			rc = -EIO; /* need cryptkey unless plain text */
-			goto neg_err_exit;
-		}
-
-		cifs_dbg(FYI, "LANMAN negotiated\n");
-		/* we will not end up setting signing flags - as no signing
-		was in LANMAN and server did not return the flags on */
-		goto signing_check;
-#else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
-		rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
-		goto neg_err_exit;
+		server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+		rc = decode_lanman_negprot_rsp(server, pSMBr);
+		goto signing_check;
 	} else if (pSMBr->hdr.WordCount != 17) {
 		/* unknown wct */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
 	}
-	/* else wct == 17 NTLM */
+	/* else wct == 17, NTLM or better */
+
 	server->sec_mode = pSMBr->SecurityMode;
 	if ((server->sec_mode & SECMODE_USER) == 0)
 		cifs_dbg(FYI, "share mode security\n");
 
-	if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
-			cifs_dbg(VFS, "Server requests plain text password but client support disabled\n");
-
-	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_NTLM)
-		server->secType = NTLM;
-	else if (secFlags & CIFSSEC_MAY_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_KRB5)
-		server->secType = Kerberos;
-	else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-		server->secType = RawNTLMSSP;
-	else if (secFlags & CIFSSEC_MAY_LANMAN)
-		server->secType = LANMAN;
-	else {
-		rc = -EOPNOTSUPP;
-		cifs_dbg(VFS, "Invalid security type\n");
-		goto neg_err_exit;
-	}
-	/* else ... any others ...? */
-
 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
 	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
@@ -569,90 +625,26 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
+
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
 		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
 			server->capabilities & CAP_EXTENDED_SECURITY) &&
 				(pSMBr->EncryptionKeyLength == 0)) {
-		/* decode security blob */
-		count = get_bcc(&pSMBr->hdr);
-		if (count < 16) {
-			rc = -EIO;
-			goto neg_err_exit;
-		}
-		spin_lock(&cifs_tcp_ses_lock);
-		if (server->srv_count > 1) {
-			spin_unlock(&cifs_tcp_ses_lock);
-			if (memcmp(server->server_GUID,
-				   pSMBr->u.extended_response.
-				   GUID, 16) != 0) {
-				cifs_dbg(FYI, "server UID changed\n");
-				memcpy(server->server_GUID,
-				       pSMBr->u.extended_response.GUID,
-				       16);
-			}
-		} else {
-			spin_unlock(&cifs_tcp_ses_lock);
-			memcpy(server->server_GUID,
-			       pSMBr->u.extended_response.GUID, 16);
-		}
-
-		if (count == 16) {
-			server->secType = RawNTLMSSP;
-		} else {
-			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob, count - 16,
-						 server);
-			if (rc == 1)
-				rc = 0;
-			else
-				rc = -EINVAL;
-			if (server->secType == Kerberos) {
-				if (!server->sec_kerberos &&
-				    !server->sec_mskerberos)
-					rc = -EOPNOTSUPP;
-			} else if (server->secType == RawNTLMSSP) {
-				if (!server->sec_ntlmssp)
-					rc = -EOPNOTSUPP;
-			} else
-				rc = -EOPNOTSUPP;
-		}
+		server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+		rc = decode_ext_sec_blob(ses, pSMBr);
 	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
 		rc = -EIO; /* no crypt key only if plain text pwd */
-		goto neg_err_exit;
-	} else
-		server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
-		/* MUST_SIGN already includes the MAY_SIGN FLAG
-		   so if this is zero it means that signing is disabled */
-		cifs_dbg(FYI, "Signing disabled\n");
-		if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
-			cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		/* signing required */
-		cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags);
-		if ((server->sec_mode &
-		     (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cifs_dbg(VFS, "signing required but server lacks support\n");
-			rc = -EOPNOTSUPP;
-		} else
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
 	} else {
-		/* signing optional ie CIFSSEC_MAY_SIGN */
-		if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+		server->capabilities &= ~CAP_EXTENDED_SECURITY;
 	}
 
+signing_check:
+	if (!rc)
+		rc = cifs_enable_signing(server, ses->sign);
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
@@ -777,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
 
 	pSMB->hdr.Mid = get_next_mid(ses->server);
 
-	if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	if (ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
@@ -1540,8 +1531,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc = 0;
 
 			rc = cifs_verify_signature(&rqst, server,
@@ -3940,6 +3930,7 @@ QFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4108,6 +4099,7 @@ UnixQFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4794,11 +4786,8 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, search_name, name_len);
 	}
 
-	if (ses->server) {
-		if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
+	if (ses->server && ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
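
The net effect of cifs_enable_signing() above is easiest to see as a table, where the client column folds together the mount option and global_secflags:

	client        server         outcome
	------        ------         -------
	disabled      required       -ENOTSUPP (mount fails)
	required      not enabled    -ENOTSUPP (mount fails)
	required      enabled        server->sign = true
	allowed       required       server->sign = true
	allowed       optional       signing left off

Every later per-packet decision in this patch then keys off the single server->sign boolean (see the CIFSSMBLogoff(), cifs_readv_callback() and getDFSRetry hunks above) instead of re-deriving it from the sec_mode flag pair each time.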
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e3bc39bb9d12..d67c550c4980 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
 	Opt_acl, Opt_noacl, Opt_locallease,
 	Opt_sign, Opt_seal, Opt_noac,
 	Opt_fsc, Opt_mfsymlinks,
-	Opt_multiuser, Opt_sloppy,
+	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -165,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_mfsymlinks, "mfsymlinks" },
 	{ Opt_multiuser, "multiuser" },
 	{ Opt_sloppy, "sloppy" },
+	{ Opt_nosharesock, "nosharesock" },
 
 	{ Opt_backupuid, "backupuid=%s" },
 	{ Opt_backupgid, "backupgid=%s" },
@@ -275,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
 	{ Smb_20, SMB20_VERSION_STRING},
 	{ Smb_21, SMB21_VERSION_STRING },
 	{ Smb_30, SMB30_VERSION_STRING },
+	{ Smb_302, SMB302_VERSION_STRING },
 };
 
 static int ip_connect(struct TCP_Server_Info *server);
@@ -1024,44 +1026,48 @@ static int cifs_parse_security_flavors(char *value,
 
 	substring_t args[MAX_OPT_ARGS];
 
+	/*
+	 * With mount options, the last one should win. Reset any existing
+	 * settings back to default.
+	 */
+	vol->sectype = Unspecified;
+	vol->sign = false;
+
 	switch (match_token(value, cifs_secflavor_tokens, args)) {
-	case Opt_sec_krb5:
-		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
-		break;
-	case Opt_sec_krb5i:
-		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
-		break;
 	case Opt_sec_krb5p:
-		/* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
-		cifs_dbg(VFS, "Krb5 cifs privacy not supported\n");
-		break;
-	case Opt_sec_ntlmssp:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+		cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+		return 1;
+	case Opt_sec_krb5i:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_krb5:
+		vol->sectype = Kerberos;
 		break;
 	case Opt_sec_ntlmsspi:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_ntlm:
-		/* ntlm is default so can be turned off too */
-		vol->secFlg |= CIFSSEC_MAY_NTLM;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmssp:
+		vol->sectype = RawNTLMSSP;
 		break;
 	case Opt_sec_ntlmi:
-		vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_sec_ntlmv2:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_ntlm:
+		vol->sectype = NTLM;
 		break;
 	case Opt_sec_ntlmv2i:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmv2:
+		vol->sectype = NTLMv2;
 		break;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	case Opt_sec_lanman:
-		vol->secFlg |= CIFSSEC_MAY_LANMAN;
+		vol->sectype = LANMAN;
 		break;
 #endif
 	case Opt_sec_none:
 		vol->nullauth = 1;
-		vol->secFlg |= CIFSSEC_MAY_NTLM;
 		break;
 	default:
 		cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1119,6 +1125,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		vol->ops = &smb30_operations;
 		vol->vals = &smb30_values;
 		break;
+	case Smb_302:
+		vol->ops = &smb30_operations; /* currently identical with 3.0 */
+		vol->vals = &smb302_values;
+		break;
 #endif
 	default:
 		cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
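
With the Smb_302 token wired into both tables, the new dialect is requested like any other vers= value at mount time, e.g. (server, share and user hypothetical):

	mount -t cifs //srv/share /mnt -o vers=3.02,username=user

As the comment in the hunk notes, vers=3.02 currently reuses smb30_operations wholesale and only substitutes smb302_values, so it differs from vers=3.0 only in what that values table changes, such as the negotiated dialect identifier and version string.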
@@ -1424,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			vol->local_lease = 1;
 			break;
 		case Opt_sign:
-			vol->secFlg |= CIFSSEC_MUST_SIGN;
+			vol->sign = true;
 			break;
 		case Opt_seal:
 			/* we do not do the following in secFlags because seal
@@ -1455,6 +1465,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_sloppy:
 			sloppy = true;
 			break;
+		case Opt_nosharesock:
+			vol->nosharesock = true;
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
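
nosharesock only sets a flag here; its teeth are in match_server() below, which refuses to match an existing TCP_Server_Info when the flag is set, so the mount gets a dedicated socket and SMB session instead of piggybacking on an existing one. For example (hypothetical shares), the second mount below will not share the first mount's TCP connection:

	mount -t cifs //srv/a /mnt/a -o username=user
	mount -t cifs //srv/b /mnt/b -o username=user,nosharesock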
@@ -1662,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (strnlen(string, 256) == 256) {
+			if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+					== CIFS_MAX_DOMAINNAME_LEN) {
 				printk(KERN_WARNING "CIFS: domain name too"
 						    " long\n");
 				goto cifs_parse_mount_err;
@@ -1978,47 +1992,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
 static bool
 match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-	unsigned int secFlags;
-
-	if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = vol->secFlg;
-	else
-		secFlags = global_secflags | vol->secFlg;
-
-	switch (server->secType) {
-	case LANMAN:
-		if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
-			return false;
-		break;
-	case NTLMv2:
-		if (!(secFlags & CIFSSEC_MAY_NTLMV2))
-			return false;
-		break;
-	case NTLM:
-		if (!(secFlags & CIFSSEC_MAY_NTLM))
-			return false;
-		break;
-	case Kerberos:
-		if (!(secFlags & CIFSSEC_MAY_KRB5))
-			return false;
-		break;
-	case RawNTLMSSP:
-		if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
-			return false;
-		break;
-	default:
-		/* shouldn't happen */
+	/*
+	 * The select_sectype function should either return the vol->sectype
+	 * that was specified, or "Unspecified" if that sectype was not
+	 * compatible with the given NEGOTIATE request.
+	 */
+	if (select_sectype(server, vol->sectype) == Unspecified)
 		return false;
-	}
 
-	/* now check if signing mode is acceptable */
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
-	    (server->sec_mode & SECMODE_SIGN_REQUIRED))
-			return false;
-	else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
-		 (server->sec_mode &
-		  (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+	/*
+	 * Now check if signing mode is acceptable. No need to check
+	 * global_secflags at this point since if MUST_SIGN is set then
+	 * the server->sign had better be too.
+	 */
+	if (vol->sign && !server->sign)
 		return false;
 
 	return true;
 }
@@ -2027,6 +2015,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
 	struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
 
+	if (vol->nosharesock)
+		return 0;
+
 	if ((server->vals != vol->vals) || (server->ops != vol->ops))
 		return 0;
 
@@ -2118,12 +2109,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
-	rc = cifs_crypto_shash_allocate(tcp_ses);
-	if (rc) {
-		cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
-		goto out_err;
-	}
-
 	tcp_ses->ops = volume_info->ops;
 	tcp_ses->vals = volume_info->vals;
 	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
@@ -2216,7 +2201,11 @@ out_err:
 
 static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 {
-	switch (ses->server->secType) {
+	if (vol->sectype != Unspecified &&
+	    vol->sectype != ses->sectype)
+		return 0;
+
+	switch (ses->sectype) {
 	case Kerberos:
 		if (!uid_eq(vol->cred_uid, ses->cred_uid))
 			return 0;
@@ -2288,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 
 #ifdef CONFIG_KEYS
 
-/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
-#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
 
 /* Populate username and pw fields from keyring if possible */
 static int
@@ -2493,7 +2482,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2493 ses->cred_uid = volume_info->cred_uid; 2482 ses->cred_uid = volume_info->cred_uid;
2494 ses->linux_uid = volume_info->linux_uid; 2483 ses->linux_uid = volume_info->linux_uid;
2495 2484
2496 ses->overrideSecFlg = volume_info->secFlg; 2485 ses->sectype = volume_info->sectype;
2486 ses->sign = volume_info->sign;
2497 2487
2498 mutex_lock(&ses->session_mutex); 2488 mutex_lock(&ses->session_mutex);
2499 rc = cifs_negotiate_protocol(xid, ses); 2489 rc = cifs_negotiate_protocol(xid, ses);
@@ -3656,7 +3646,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3656 NTLMv2 password here) */ 3646 NTLMv2 password here) */
3657#ifdef CONFIG_CIFS_WEAK_PW_HASH 3647#ifdef CONFIG_CIFS_WEAK_PW_HASH
3658 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 3648 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
3659 (ses->server->secType == LANMAN)) 3649 (ses->sectype == LANMAN))
3660 calc_lanman_hash(tcon->password, ses->server->cryptkey, 3650 calc_lanman_hash(tcon->password, ses->server->cryptkey,
3661 ses->server->sec_mode & 3651 ses->server->sec_mode &
3662 SECMODE_PW_ENCRYPT ? true : false, 3652 SECMODE_PW_ENCRYPT ? true : false,
@@ -3674,8 +3664,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3674 } 3664 }
3675 } 3665 }
3676 3666
3677 if (ses->server->sec_mode & 3667 if (ses->server->sign)
3678 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3679 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 3668 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
3680 3669
3681 if (ses->capabilities & CAP_STATUS32) { 3670 if (ses->capabilities & CAP_STATUS32) {
@@ -3738,7 +3727,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3738 } 3727 }
3739 bcc_ptr += length + 1; 3728 bcc_ptr += length + 1;
3740 bytes_left -= (length + 1); 3729 bytes_left -= (length + 1);
3741 strncpy(tcon->treeName, tree, MAX_TREE_SIZE); 3730 strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
3742 3731
3743 /* mostly informational -- no need to fail on error here */ 3732 /* mostly informational -- no need to fail on error here */
3744 kfree(tcon->nativeFileSystem); 3733 kfree(tcon->nativeFileSystem);
@@ -3827,7 +3816,6 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3827 int rc = -ENOSYS; 3816 int rc = -ENOSYS;
3828 struct TCP_Server_Info *server = ses->server; 3817 struct TCP_Server_Info *server = ses->server;
3829 3818
3830 ses->flags = 0;
3831 ses->capabilities = server->capabilities; 3819 ses->capabilities = server->capabilities;
3832 if (linuxExtEnabled == 0) 3820 if (linuxExtEnabled == 0)
3833 ses->capabilities &= (~server->vals->cap_unix); 3821 ses->capabilities &= (~server->vals->cap_unix);
@@ -3848,6 +3836,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3848 server->sequence_number = 0x2; 3836 server->sequence_number = 0x2;
3849 server->session_estab = true; 3837 server->session_estab = true;
3850 ses->auth_key.response = NULL; 3838 ses->auth_key.response = NULL;
3839 if (server->ops->generate_signingkey)
3840 server->ops->generate_signingkey(server);
3851 } 3841 }
3852 mutex_unlock(&server->srv_mutex); 3842 mutex_unlock(&server->srv_mutex);
3853 3843
@@ -3870,23 +3860,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3870static int 3860static int
3871cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) 3861cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
3872{ 3862{
3873 switch (ses->server->secType) { 3863 vol->sectype = ses->sectype;
3874 case Kerberos: 3864
3875 vol->secFlg = CIFSSEC_MUST_KRB5; 3865 /* krb5 is special, since we don't need username or pw */
3866 if (vol->sectype == Kerberos)
3876 return 0; 3867 return 0;
3877 case NTLMv2:
3878 vol->secFlg = CIFSSEC_MUST_NTLMV2;
3879 break;
3880 case NTLM:
3881 vol->secFlg = CIFSSEC_MUST_NTLM;
3882 break;
3883 case RawNTLMSSP:
3884 vol->secFlg = CIFSSEC_MUST_NTLMSSP;
3885 break;
3886 case LANMAN:
3887 vol->secFlg = CIFSSEC_MUST_LANMAN;
3888 break;
3889 }
3890 3868
3891 return cifs_set_cifscreds(vol, ses); 3869 return cifs_set_cifscreds(vol, ses);
3892} 3870}
@@ -3912,6 +3890,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
3912 vol_info->nocase = master_tcon->nocase; 3890 vol_info->nocase = master_tcon->nocase;
3913 vol_info->local_lease = master_tcon->local_lease; 3891 vol_info->local_lease = master_tcon->local_lease;
3914 vol_info->no_linux_ext = !master_tcon->unix_ext; 3892 vol_info->no_linux_ext = !master_tcon->unix_ext;
3893 vol_info->sectype = master_tcon->ses->sectype;
3894 vol_info->sign = master_tcon->ses->sign;
3915 3895
3916 rc = cifs_set_vol_auth(vol_info, master_tcon->ses); 3896 rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
3917 if (rc) { 3897 if (rc) {
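The connect.c rework above replaces the old secFlags bitmask walk with two checks: can select_sectype() produce a usable auth type for this server, and is the requested signing state compatible with what the server negotiated. A minimal standalone sketch of that shape follows; the enum values, struct layouts and helper names here are illustrative stand-ins, not the kernel's:

	/* sketch of the two-step security match; build with: cc -o m m.c */
	#include <stdbool.h>
	#include <stdio.h>

	enum sectype { UNSPECIFIED, NTLM, NTLMV2, KRB5 };

	struct server { enum sectype negotiated; bool sign; };
	struct vol    { enum sectype sectype;    bool sign; };

	/* stand-in for select_sectype(): does the server offer what was asked? */
	static enum sectype pick(const struct server *srv, enum sectype req)
	{
		if (req == UNSPECIFIED)
			return srv->negotiated;	/* fall back to server's choice */
		return req == srv->negotiated ? req : UNSPECIFIED;
	}

	static bool match_security(const struct server *srv, const struct vol *vol)
	{
		if (pick(srv, vol->sectype) == UNSPECIFIED)
			return false;		/* no usable auth type */
		if (vol->sign && !srv->sign)
			return false;		/* mount wants signing, server lacks it */
		return true;
	}

	int main(void)
	{
		struct server srv = { NTLMV2, false };
		struct vol v1 = { UNSPECIFIED, false }, v2 = { NTLMV2, true };
		printf("%d %d\n", match_security(&srv, &v1),
				  match_security(&srv, &v2));	/* prints "1 0" */
		return 0;
	}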
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5699b5036ed8..d62ce0d48141 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,6 +204,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	struct inode *newinode = NULL;
 	int disposition;
 	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;

 	*oplock = 0;
 	if (tcon->ses->server->oplocks)
@@ -319,9 +320,16 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, fid, oplock,
-			       buf, cifs_sb);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
 		goto out;
@@ -822,8 +830,7 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };

-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
@@ -838,12 +845,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }

-static int cifs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;

 	if ((name->len == len) &&
 	    (nls_strnicmp(codepage, name->name, str, len) == 0))
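The cifs_open_parms conversion in dir.c (and below in file.c) replaces a nine-argument ->open() call with a single parameter struct, so new members such as reconnect can be added without touching every caller. A hedged sketch of the same refactor outside the kernel, with invented names:

	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms {
		const char *path;
		int desired_access;
		int create_options;
		int disposition;
		bool reconnect;	/* a later field costs nothing at old call sites */
	};

	static int do_open(const struct open_parms *p)
	{
		printf("open %s access=%d reconnect=%d\n",
		       p->path, p->desired_access, p->reconnect);
		return 0;
	}

	int main(void)
	{
		struct open_parms oparms = {
			.path = "/share/file.txt",	/* hypothetical path */
			.desired_access = 2,
			.create_options = 0,
			.disposition = 1,
			.reconnect = false,
		};
		return do_open(&oparms);
	}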
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..7e36ae34e947 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -183,6 +183,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 	int create_options = CREATE_NOT_DIR;
 	FILE_ALL_INFO *buf;
 	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;

 	if (!server->ops->open)
 		return -ENOSYS;
@@ -224,9 +225,16 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, fid, oplock, buf,
-			       cifs_sb);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);

 	if (rc)
 		goto out;
@@ -553,11 +561,10 @@ cifs_relock_file(struct cifsFileInfo *cfile)
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	int rc = 0;

-	/* we are going to update can_cache_brlcks here - need a write access */
-	down_write(&cinode->lock_sem);
+	down_read(&cinode->lock_sem);
 	if (cinode->can_cache_brlcks) {
-		/* can cache locks - no need to push them */
-		up_write(&cinode->lock_sem);
+		/* can cache locks - no need to relock */
+		up_read(&cinode->lock_sem);
 		return rc;
 	}

@@ -568,7 +575,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
 	else
 		rc = tcon->ses->server->ops->push_mand_locks(cfile);

-	up_write(&cinode->lock_sem);
+	up_read(&cinode->lock_sem);
 	return rc;
 }

@@ -587,7 +594,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	int desired_access;
 	int disposition = FILE_OPEN;
 	int create_options = CREATE_NOT_DIR;
-	struct cifs_fid fid;
+	struct cifs_open_parms oparms;

 	xid = get_xid();
 	mutex_lock(&cfile->fh_mutex);
@@ -637,9 +644,10 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)

 		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
 				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &fid.netfid, xid);
+				     oflags, &oplock, &cfile->fid.netfid, xid);
 		if (rc == 0) {
 			cifs_dbg(FYI, "posix reopen succeeded\n");
+			oparms.reconnect = true;
 			goto reopen_success;
 		}
 		/*
@@ -654,7 +662,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

 	if (server->ops->get_lease_key)
-		server->ops->get_lease_key(inode, &fid);
+		server->ops->get_lease_key(inode, &cfile->fid);
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = &cfile->fid;
+	oparms.reconnect = true;

 	/*
 	 * Can not refresh inode by passing in file_info buf to be returned by
@@ -663,9 +680,14 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	 * version of file size can be stale. If we knew for sure that inode was
 	 * not dirty locally we could do this.
 	 */
-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, &fid, &oplock,
-			       NULL, cifs_sb);
+	rc = server->ops->open(xid, &oparms, &oplock, NULL);
+	if (rc == -ENOENT && oparms.reconnect == false) {
+		/* durable handle timeout is expired - open the file again */
+		rc = server->ops->open(xid, &oparms, &oplock, NULL);
+		/* indicate that we need to relock the file */
+		oparms.reconnect = true;
+	}
+
 	if (rc) {
 		mutex_unlock(&cfile->fh_mutex);
 		cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
@@ -696,8 +718,9 @@ reopen_success:
 	 * to the server to get the new inode info.
 	 */

-	server->ops->set_fid(cfile, &fid, oplock);
-	cifs_relock_file(cfile);
+	server->ops->set_fid(cfile, &cfile->fid, oplock);
+	if (oparms.reconnect)
+		cifs_relock_file(cfile);

 reopen_error_exit:
 	kfree(full_path);
@@ -999,7 +1022,7 @@ try_again:
 		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
 		if (!rc)
 			goto try_again;
-		locks_delete_block(flock);
+		posix_unblock_lock(flock);
 	}
 	return rc;
 }
@@ -1092,6 +1115,7 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct file_lock *flock, **before;
 	unsigned int count = 0, i = 0;
@@ -1102,12 +1126,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)

 	xid = get_xid();

-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		if ((*before)->fl_flags & FL_POSIX)
 			count++;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);

 	INIT_LIST_HEAD(&locks_to_send);

@@ -1126,8 +1150,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	}

 	el = locks_to_send.next;
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		flock = *before;
 		if ((flock->fl_flags & FL_POSIX) == 0)
 			continue;
@@ -1152,7 +1176,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->offset = flock->fl_start;
 		el = el->next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);

 	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
 		int stored_rc;
@@ -3546,11 +3570,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }

-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);

-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }

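The reopen path above retries the open once when it fails with -ENOENT while oparms.reconnect is false, then records that byte-range locks must be re-sent. A small sketch of that retry-once-then-relock shape; this models the control flow only, not the kernel's exact durable-handle state machine, and all names are invented:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms { bool reconnect; };

	/* pretend the first attempt finds the durable handle expired */
	static int server_open(struct open_parms *p)
	{
		static int calls;
		return ++calls == 1 ? -ENOENT : 0;
	}

	static int reopen(struct open_parms *p)
	{
		int rc = server_open(p);
		if (rc == -ENOENT && !p->reconnect) {
			/* handle timed out on the server - open the file again */
			rc = server_open(p);
			p->reconnect = true;	/* locks must be re-sent now */
		}
		return rc;
	}

	int main(void)
	{
		struct open_parms oparms = { .reconnect = false };
		int rc = reopen(&oparms);
		if (rc == 0 && oparms.reconnect)
			puts("reopened; relocking file");
		return rc;
	}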
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20efd81266c6..449b6cf09b09 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -558,6 +558,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 			fattr->cf_mode &= ~(S_IWUGO);

 		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+		if (fattr->cf_nlink < 1) {
+			cifs_dbg(1, "replacing bogus file nlink value %u\n",
+				   fattr->cf_nlink);
+			fattr->cf_nlink = 1;
+		}
 	}

 	fattr->cf_uid = cifs_sb->mnt_uid;
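The inode.c hunk guards against servers that report NumberOfLinks as zero: a live file always has at least one link, so the count is clamped before the VFS sees it. A self-contained sketch of that sanitization, with invented names:

	#include <stdio.h>

	/* clamp a server-supplied link count to something the VFS can trust */
	static unsigned int sanitize_nlink(unsigned int nlink)
	{
		if (nlink < 1) {
			fprintf(stderr, "replacing bogus nlink value %u\n", nlink);
			nlink = 1;	/* every live file has at least one link */
		}
		return nlink;
	}

	int main(void)
	{
		printf("%u %u\n", sanitize_nlink(0), sanitize_nlink(3)); /* 1 3 */
		return 0;
	}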
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b83c3f5646bd..562044f700e5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
 }

 int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		   const unsigned char *path,
-		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid)
 {
 	int rc;
 	int oplock = 0;
 	__u16 netfid = 0;
 	struct tcon_link *tlink;
-	struct cifs_tcon *pTcon;
+	struct cifs_tcon *ptcon;
 	struct cifs_io_parms io_parms;
-	u8 *buf;
-	char *pbuf;
-	unsigned int bytes_read = 0;
 	int buf_type = CIFS_NO_BUFFER;
-	unsigned int link_len = 0;
 	FILE_ALL_INFO file_info;

-	if (!CIFSCouldBeMFSymlink(fattr))
-		/* it's not a symlink */
-		return 0;
-
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
-	pTcon = tlink_tcon(tlink);
+	ptcon = tlink_tcon(tlink);

-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
 			 CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		cifs_put_tlink(tlink);
+		return rc;
+	}

 	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, ptcon, netfid);
+		cifs_put_tlink(tlink);
 		/* it's not a symlink */
-		goto out;
+		return rc;
 	}

-	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	pbuf = buf;
 	io_parms.netfid = netfid;
 	io_parms.pid = current->tgid;
-	io_parms.tcon = pTcon;
+	io_parms.tcon = ptcon;
 	io_parms.offset = 0;
 	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;

-	rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
-	CIFSSMBClose(xid, pTcon, netfid);
-	if (rc != 0) {
-		kfree(buf);
+	rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, ptcon, netfid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		   const unsigned char *path,
+		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+	int rc = 0;
+	u8 *buf = NULL;
+	unsigned int link_len = 0;
+	unsigned int bytes_read = 0;
+	struct cifs_tcon *ptcon;
+
+	if (!CIFSCouldBeMFSymlink(fattr))
+		/* it's not a symlink */
+		return 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
 		goto out;
 	}

+	ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+	if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
+		rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
+						&bytes_read, cifs_sb, xid);
+	else
+		goto out;
+
+	if (rc != 0)
+		goto out;
+
+	if (bytes_read == 0) /* not a symlink */
+		goto out;
+
 	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
-	kfree(buf);
 	if (rc == -EINVAL) {
 		/* it's not a symlink */
 		rc = 0;
@@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
 	fattr->cf_dtype = DT_LNK;
 out:
-	cifs_put_tlink(tlink);
+	kfree(buf);
 	return rc;
 }

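The link.c split moves the protocol-specific open/read/close into a helper that fills a caller-supplied buffer and reports bytes read through an out-parameter, leaving the generic parsing in CIFSCheckMFSymlink. A hedged sketch of that division of labor; the "XSym" magic matches the Minshall-French symlink format, but every function name here is invented:

	#include <stdio.h>
	#include <string.h>

	/* protocol half: fill caller's buffer, report bytes read */
	static int query_symlink(const char *path, char *buf, size_t buflen,
				 unsigned int *bytes_read)
	{
		const char *fake = "XSym\n0010\n";	/* stand-in file contents */
		*bytes_read = (unsigned int)strlen(fake);
		if (*bytes_read > buflen)
			return -1;
		memcpy(buf, fake, *bytes_read);
		return 0;
	}

	/* generic half: decide whether the blob really is a symlink */
	static int check_symlink(const char *path)
	{
		char buf[64];
		unsigned int n = 0;
		if (query_symlink(path, buf, sizeof(buf), &n) != 0 || n == 0)
			return 0;			/* not a symlink */
		return strncmp(buf, "XSym", 4) == 0;	/* parse without re-reading */
	}

	int main(void)
	{
		printf("symlink? %d\n", check_symlink("/share/link"));
		return 0;
	}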
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1bec014779fd..f7d4b2285efe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		if (treeCon->nocase)
 			buffer->Flags |= SMBFLG_CASELESS;
 		if ((treeCon->ses) && (treeCon->ses->server))
-			if (treeCon->ses->server->sec_mode &
-			  (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			if (treeCon->ses->server->sign)
 				buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 	}

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 770d5a9781c1..69d2c826a23b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 		return;
 	}

+	/*
+	 * If we know that the inode will need to be revalidated immediately,
+	 * then don't create a new dentry for it. We'll end up doing an on
+	 * the wire call either way and this spares us an invalidation.
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		return;
+
 	dentry = d_alloc(parent, name);
 	if (!dentry)
 		return;
@@ -126,6 +134,22 @@ out:
 	dput(dentry);
 }

+/*
+ * Is it possible that this directory might turn out to be a DFS referral
+ * once we go to try and use it?
+ */
+static bool
+cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		return true;
+#endif
+	return false;
+}
+
 static void
 cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 {
@@ -135,6 +159,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
+		/*
+		 * Windows CIFS servers generally make DFS referrals look
+		 * like directories in FIND_* responses with the reparse
+		 * attribute flag also set (since DFS junctions are
+		 * reparse points). We must revalidate at least these
+		 * directory inodes before trying to use them (if
+		 * they are DFS we will get PATH_NOT_COVERED back
+		 * when queried directly and can then try to connect
+		 * to the DFS target)
+		 */
+		if (cifs_dfs_is_possible(cifs_sb) &&
+		    (fattr->cf_cifsattrs & ATTR_REPARSE))
+			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -537,14 +574,14 @@ static int cifs_save_resume_key(const char *current_entry,
  * every entry (do not increment for . or .. entry).
  */
 static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		struct file *file, char **current_entry, int *num_to_ret)
 {
 	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
-	loff_t index_to_find = file->f_pos;
+	loff_t index_to_find = pos;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -659,8 +696,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 	return rc;
 }

-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
-		void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
 {
 	struct cifsFileInfo *file_info = file->private_data;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -740,13 +778,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 	cifs_prime_dcache(file->f_dentry, &name, &fattr);

 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-	rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
-		     fattr.cf_dtype);
-	return rc;
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
 }


-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -772,103 +808,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		goto rddir2_exit;
 	}

-	switch ((int) file->f_pos) {
-	case 0:
-		if (filldir(direntry, ".", 1, file->f_pos,
-		     file_inode(file)->i_ino, DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for current dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	case 1:
-		if (filldir(direntry, "..", 2, file->f_pos,
-		     parent_ino(file->f_path.dentry), DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for parent dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	default:
-		/* 1) If search is active,
-			is in current search buffer?
-			if it before then restart search
-			if after then keep searching till find it */
-
-		if (file->private_data == NULL) {
-			rc = -EINVAL;
-			free_xid(xid);
-			return rc;
-		}
-		cifsFile = file->private_data;
-		if (cifsFile->srch_inf.endOfSearch) {
-			if (cifsFile->srch_inf.emptyDir) {
-				cifs_dbg(FYI, "End of search, empty dir\n");
-				rc = 0;
-				break;
-			}
-		} /* else {
-			cifsFile->invalidHandle = true;
-			tcon->ses->server->close(xid, tcon, &cifsFile->fid);
-		} */
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;

-		tcon = tlink_tcon(cifsFile->tlink);
-		rc = find_cifs_entry(xid, tcon, file, &current_entry,
-				     &num_to_fill);
-		if (rc) {
-			cifs_dbg(FYI, "fce error %d\n", rc);
-			goto rddir2_exit;
-		} else if (current_entry != NULL) {
-			cifs_dbg(FYI, "entry %lld found\n", file->f_pos);
-		} else {
-			cifs_dbg(FYI, "could not find entry\n");
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	if (file->private_data == NULL) {
+		rc = -EINVAL;
+		goto rddir2_exit;
+	}
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
 			goto rddir2_exit;
 		}
-		cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
-			 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
-		max_len = tcon->ses->server->ops->calc_smb_size(
-				cifsFile->srch_inf.ntwrk_buf_start);
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
-		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
-		if (tmp_buf == NULL) {
-			rc = -ENOMEM;
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+				 num_to_fill, i);
 			break;
 		}
-
-		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
-			if (current_entry == NULL) {
-				/* evaluate whether this case is an error */
-				cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
-					 num_to_fill, i);
-				break;
-			}
-			/*
-			 * if buggy server returns . and .. late do we want to
-			 * check for that here?
-			 */
-			rc = cifs_filldir(current_entry, file, filldir,
-					  direntry, tmp_buf, max_len);
-			if (rc == -EOVERFLOW) {
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
+		if (rc) {
+			if (rc > 0)
 				rc = 0;
-				break;
-			}
-
-			file->f_pos++;
-			if (file->f_pos ==
-				cifsFile->srch_inf.index_of_last_entry) {
-				cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
-					 file->f_pos, tmp_buf);
-				cifs_save_resume_key(current_entry, cifsFile);
-				break;
-			} else
-				current_entry =
-					nxt_dir_entry(current_entry, end_of_smb,
-						cifsFile->srch_inf.info_level);
+			break;
 		}
-		kfree(tmp_buf);
-		break;
-	} /* end switch */
+
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);

 rddir2_exit:
 	free_xid(xid);
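The readdir conversion above moves from the old filldir callback to the dir_context model: dir_emit() returns false when the caller's buffer is full, and the resume position lives in ctx->pos rather than file->f_pos. A self-contained sketch of an iterator in that style, with invented names:

	#include <stdbool.h>
	#include <stdio.h>

	struct dir_ctx {
		long long pos;
		int room;		/* emulate a finite user buffer */
	};

	static bool emit(struct dir_ctx *ctx, const char *name)
	{
		if (ctx->room-- <= 0)
			return false;	/* buffer full: stop, keep ctx->pos */
		printf("%lld: %s\n", ctx->pos, name);
		return true;
	}

	static void iterate(struct dir_ctx *ctx)
	{
		static const char *names[] = { "a.txt", "b.txt", "c.txt" };
		while (ctx->pos < 3) {
			if (!emit(ctx, names[ctx->pos]))
				return;	/* caller resumes from ctx->pos later */
			ctx->pos++;
		}
	}

	int main(void)
	{
		struct dir_ctx ctx = { .pos = 0, .room = 2 };
		iterate(&ctx);		/* emits a.txt, b.txt */
		ctx.room = 2;
		iterate(&ctx);		/* resumes at c.txt */
		return 0;
	}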
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index f230571a7ab3..08dd37bb23aa 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
 			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;

-	if (ses->server->sec_mode &
-	    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->server->sign)
 		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;

 	if (ses->capabilities & CAP_UNICODE) {
@@ -198,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
 		bytes_ret = 0;
 	} else
 		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
-					    256, nls_cp);
+					    CIFS_MAX_DOMAINNAME_LEN, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2;  /* account for null terminator */

@@ -256,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,

 	/* copy domain */
 	if (ses->domainName != NULL) {
-		strncpy(bcc_ptr, ses->domainName, 256);
-		bcc_ptr += strnlen(ses->domainName, 256);
+		strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+		bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
 	} /* else we will send a null domain name
 	     so the server will default to its own domain */
 	*bcc_ptr = 0;
@@ -310,11 +309,10 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	return;
 }

-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 			       struct cifs_ses *ses,
 			       const struct nls_table *nls_cp)
 {
-	int rc = 0;
 	int len;
 	char *bcc_ptr = *pbcc_area;

@@ -322,24 +320,22 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,

 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;

 	kfree(ses->serverOS);

 	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
-	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
+	if (strncmp(ses->serverOS, "OS/2", 4) == 0)
 		cifs_dbg(FYI, "OS/2 server\n");
-		ses->flags |= CIFS_SES_OS2;
-	}

 	bcc_ptr += len + 1;
 	bleft -= len + 1;

 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;

 	kfree(ses->serverNOS);

@@ -352,7 +348,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,

 	len = strnlen(bcc_ptr, bleft);
 	if (len > bleft)
-		return rc;
+		return;

 	/* No domain field in LANMAN case. Domain is
 	   returned by old servers in the SMB negprot response */
@@ -360,8 +356,6 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
 	cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
-
-	return rc;
 }

 int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -432,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -471,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -558,6 +550,56 @@ setup_ntlmv2_ret:
 	return rc;
 }

+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+	switch (server->negflavor) {
+	case CIFS_NEGFLAVOR_EXTENDED:
+		switch (requested) {
+		case Kerberos:
+		case RawNTLMSSP:
+			return requested;
+		case Unspecified:
+			if (server->sec_ntlmssp &&
+			    (global_secflags & CIFSSEC_MAY_NTLMSSP))
+				return RawNTLMSSP;
+			if ((server->sec_kerberos || server->sec_mskerberos) &&
+			    (global_secflags & CIFSSEC_MAY_KRB5))
+				return Kerberos;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_UNENCAP:
+		switch (requested) {
+		case NTLM:
+		case NTLMv2:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_NTLMV2)
+				return NTLMv2;
+			if (global_secflags & CIFSSEC_MAY_NTLM)
+				return NTLM;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_LANMAN:
+		switch (requested) {
+		case LANMAN:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_LANMAN)
+				return LANMAN;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	default:
+		return Unspecified;
+	}
+}
+
 int
 CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	       const struct nls_table *nls_cp)
@@ -579,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	u16 blob_len;
 	char *ntlmsspblob = NULL;

-	if (ses == NULL)
+	if (ses == NULL) {
+		WARN(1, "%s: ses == NULL!", __func__);
 		return -EINVAL;
+	}

-	type = ses->server->secType;
+	type = select_sectype(ses->server, ses->sectype);
 	cifs_dbg(FYI, "sess setup type %d\n", type);
+	if (type == Unspecified) {
+		cifs_dbg(VFS, "Unable to select appropriate authentication method!");
+		return -EINVAL;
+	}
+
 	if (type == RawNTLMSSP) {
 		/* if memory allocation is successful, caller of this function
 		 * frees it.
@@ -643,8 +692,6 @@ ssetup_ntlmssp_authenticate:
 	}
 	bcc_ptr = str_area;

-	ses->flags &= ~CIFS_SES_LANMAN;
-
 	iov[1].iov_base = NULL;
 	iov[1].iov_len = 0;

@@ -668,7 +715,6 @@ ssetup_ntlmssp_authenticate:
 				      ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
 				      true : false, lnm_session_key);

-		ses->flags |= CIFS_SES_LANMAN;
 		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
 		bcc_ptr += CIFS_AUTH_RESP_SIZE;

@@ -938,8 +984,7 @@ ssetup_ntlmssp_authenticate:
 		}
 		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	} else {
-		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
-					 ses, nls_cp);
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	}

 ssetup_exit:
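The new select_sectype() honors an explicit request when the negotiated flavor supports it, otherwise falls back through the allowed mechanisms in order of preference. A compact standalone sketch of that selection shape (one flavor only; enum names and flags are illustrative, not the kernel's):

	#include <stdio.h>

	enum sec { UNSPEC, NTLM, NTLMV2 };

	/* honor an explicit request if supported, else pick the strongest allowed */
	static enum sec select_sec(int may_ntlmv2, int may_ntlm, enum sec requested)
	{
		switch (requested) {
		case NTLM:
		case NTLMV2:
			return requested;	/* caller knows what it wants */
		case UNSPEC:
			if (may_ntlmv2)
				return NTLMV2;	/* prefer the stronger hash */
			if (may_ntlm)
				return NTLM;
			/* fallthrough */
		default:
			return UNSPEC;		/* nothing acceptable */
		}
	}

	int main(void)
	{
		printf("%d\n", select_sec(1, 1, UNSPEC));	/* 2 (NTLMV2) */
		printf("%d\n", select_sec(0, 0, UNSPEC));	/* 0 (UNSPEC) */
		return 0;
	}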
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3efdb9d5c0b8..60943978aec3 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -449,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	 * WRITEX header, not including the 4 byte RFC1001 length.
 	 */
 	if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
-	    (!(server->capabilities & CAP_UNIX) &&
-	     (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+	    (!(server->capabilities & CAP_UNIX) && server->sign))
 		wsize = min_t(unsigned int, wsize,
 				server->maxBuf - sizeof(WRITE_REQ) + 4);

@@ -675,20 +674,23 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
 }

 static int
-cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
-	       int disposition, int desired_access, int create_options,
-	       struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
-	       struct cifs_sb_info *cifs_sb)
-{
-	if (!(tcon->ses->capabilities & CAP_NT_SMBS))
-		return SMBLegacyOpen(xid, tcon, path, disposition,
-				     desired_access, create_options,
-				     &fid->netfid, oplock, buf,
-				     cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
+{
+	if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+		return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+				     oparms->disposition,
+				     oparms->desired_access,
+				     oparms->create_options,
+				     &oparms->fid->netfid, oplock, buf,
+				     oparms->cifs_sb->local_nls,
+				     oparms->cifs_sb->mnt_cifs_flags
 				     & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	return CIFSSMBOpen(xid, tcon, path, disposition, desired_access,
-			   create_options, &fid->netfid, oplock, buf,
-			   cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+	return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
+			   oparms->disposition, oparms->desired_access,
+			   oparms->create_options, &oparms->fid->netfid, oplock,
+			   buf, oparms->cifs_sb->local_nls,
+			   oparms->cifs_sb->mnt_cifs_flags &
 			   CIFS_MOUNT_MAP_SPECIAL_CHR);
 }

@@ -765,20 +767,14 @@ smb_set_file_info(struct inode *inode, const char *full_path,
 	}
 	tcon = tlink_tcon(tlink);

-	/*
-	 * NT4 apparently returns success on this call, but it doesn't really
-	 * work.
-	 */
-	if (!(tcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
-					cifs_sb->local_nls,
+	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc == 0) {
 		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
 		goto out;
-	} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+	} else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
 		goto out;
 	}

 	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
@@ -948,6 +944,7 @@ struct smb_version_operations smb1_operations = {
 	.mand_lock = cifs_mand_lock,
 	.mand_unlock_range = cifs_unlock_range,
 	.push_mand_locks = cifs_push_mandatory_locks,
+	.query_mf_symlink = open_query_close_cifs_symlink,
 };

 struct smb_version_values smb1_values = {
@@ -964,4 +961,6 @@ struct smb_version_values smb1_values = {
 	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
 	.cap_large_files = CAP_LARGE_FILES,
 	.oplock_read = OPLOCK_READ,
+	.signing_enabled = SECMODE_SIGN_ENABLED,
+	.signing_required = SECMODE_SIGN_REQUIRED,
 };
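smb1_operations gains a .query_mf_symlink hook here while other dialects leave it unset, so callers (as in the link.c hunk above) test the pointer before dispatching. A minimal sketch of that optional-vtable-member pattern, with invented names:

	#include <stdio.h>

	struct ops {
		int (*query_mf_symlink)(const char *path);	/* optional hook */
	};

	static int smb1_query(const char *path)
	{
		printf("SMB1 query of %s\n", path);
		return 0;
	}

	static const struct ops smb1_ops = { .query_mf_symlink = smb1_query };
	static const struct ops smb2_ops = { 0 };	/* hook not implemented */

	static int maybe_query(const struct ops *o, const char *path)
	{
		if (!o->query_mf_symlink)
			return -1;	/* dialect can't do it: skip quietly */
		return o->query_mf_symlink(path);
	}

	int main(void)
	{
		maybe_query(&smb1_ops, "/share/link");
		maybe_query(&smb2_ops, "/share/link");
		return 0;
	}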
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 5da1b55a2258..04a81a4142c3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -40,7 +40,8 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	oplock &= 0xFF;
 	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
 		return;
-	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+	    oplock == SMB2_OPLOCK_LEVEL_BATCH) {
 		cinode->clientCanCacheAll = true;
 		cinode->clientCanCacheRead = true;
 		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
@@ -57,17 +58,16 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 }

 int
-smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
-	       int disposition, int desired_access, int create_options,
-	       struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
-	       struct cifs_sb_info *cifs_sb)
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
 {
 	int rc;
 	__le16 *smb2_path;
 	struct smb2_file_all_info *smb2_data = NULL;
 	__u8 smb2_oplock[17];
+	struct cifs_fid *fid = oparms->fid;

-	smb2_path = cifs_convert_path_to_utf16(path, cifs_sb);
+	smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
 	if (smb2_path == NULL) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -80,21 +80,19 @@ smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
 		goto out;
 	}

-	desired_access |= FILE_READ_ATTRIBUTES;
-	*smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+	oparms->desired_access |= FILE_READ_ATTRIBUTES;
+	*smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;

-	if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+	if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
 		memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);

-	rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid,
-		       &fid->volatile_fid, desired_access, disposition,
-		       0, 0, smb2_oplock, smb2_data);
+	rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
 	if (rc)
 		goto out;

 	if (buf) {
 		/* open response does not have IndexNumber field - get it */
-		rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid,
+		rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
 				      fid->volatile_fid,
 				      &smb2_data->IndexNumber);
 		if (rc) {
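smb2_set_oplock_level now treats a batch oplock like an exclusive one for caching purposes, and the client requests BATCH by default. A standalone sketch of that level-to-cache-state mapping; the level values match MS-SMB2 (none 0x00, level II 0x01, exclusive 0x08, batch 0x09), everything else here is illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	#define LVL_NONE	0x00
	#define LVL_II		0x01
	#define LVL_EXCLUSIVE	0x08
	#define LVL_BATCH	0x09

	struct cache { bool all, read; };

	static void set_oplock(struct cache *c, unsigned int oplock)
	{
		oplock &= 0xFF;
		if (oplock == LVL_EXCLUSIVE || oplock == LVL_BATCH) {
			c->all = c->read = true;	/* batch caches like exclusive */
		} else if (oplock == LVL_II) {
			c->all = false;
			c->read = true;			/* read caching only */
		} else {
			c->all = c->read = false;	/* no caching */
		}
	}

	int main(void)
	{
		struct cache c;
		set_oplock(&c, LVL_BATCH);
		printf("all=%d read=%d\n", c.all, c.read);	/* all=1 read=1 */
		return 0;
	}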
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..c38350851b08 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
 #define SMB2_SIGNATURE_SIZE (16)
 #define SMB2_NTLMV2_SESSKEY_SIZE (16)
 #define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)

 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fff6dfba6204..c6ec1633309a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -41,21 +41,26 @@ static int
41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, 41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
42 struct cifs_sb_info *cifs_sb, const char *full_path, 42 struct cifs_sb_info *cifs_sb, const char *full_path,
43 __u32 desired_access, __u32 create_disposition, 43 __u32 desired_access, __u32 create_disposition,
44 __u32 file_attributes, __u32 create_options, 44 __u32 create_options, void *data, int command)
45 void *data, int command)
46{ 45{
47 int rc, tmprc = 0; 46 int rc, tmprc = 0;
48 u64 persistent_fid, volatile_fid;
49 __le16 *utf16_path; 47 __le16 *utf16_path;
50 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 48 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
49 struct cifs_open_parms oparms;
50 struct cifs_fid fid;
51 51
52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
53 if (!utf16_path) 53 if (!utf16_path)
54 return -ENOMEM; 54 return -ENOMEM;
55 55
56 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 56 oparms.tcon = tcon;
57 desired_access, create_disposition, file_attributes, 57 oparms.desired_access = desired_access;
58 create_options, &oplock, NULL); 58 oparms.disposition = create_disposition;
59 oparms.create_options = create_options;
60 oparms.fid = &fid;
61 oparms.reconnect = false;
62
63 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
59 if (rc) { 64 if (rc) {
60 kfree(utf16_path); 65 kfree(utf16_path);
61 return rc; 66 return rc;
@@ -65,8 +70,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
65 case SMB2_OP_DELETE: 70 case SMB2_OP_DELETE:
66 break; 71 break;
67 case SMB2_OP_QUERY_INFO: 72 case SMB2_OP_QUERY_INFO:
68 tmprc = SMB2_query_info(xid, tcon, persistent_fid, 73 tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
69 volatile_fid, 74 fid.volatile_fid,
70 (struct smb2_file_all_info *)data); 75 (struct smb2_file_all_info *)data);
71 break; 76 break;
72 case SMB2_OP_MKDIR: 77 case SMB2_OP_MKDIR:
@@ -76,19 +81,21 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		 */
 		break;
 	case SMB2_OP_RENAME:
-		tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid,
-				    (__le16 *)data);
+		tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+				    fid.volatile_fid, (__le16 *)data);
 		break;
 	case SMB2_OP_HARDLINK:
-		tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid,
-					  volatile_fid, (__le16 *)data);
+		tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+					  fid.volatile_fid, (__le16 *)data);
 		break;
 	case SMB2_OP_SET_EOF:
-		tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid,
-				     current->tgid, (__le64 *)data);
+		tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+				     fid.volatile_fid, current->tgid,
+				     (__le64 *)data);
 		break;
 	case SMB2_OP_SET_INFO:
-		tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid,
+		tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+				      fid.volatile_fid,
 				      (FILE_BASIC_INFO *)data);
 		break;
 	default:
@@ -96,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		break;
 	}
 
-	rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	if (tmprc)
 		rc = tmprc;
 	kfree(utf16_path);
@@ -129,8 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return -ENOMEM;
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0,
-				smb2_data, SMB2_OP_QUERY_INFO);
+				FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
+				SMB2_OP_QUERY_INFO);
 	if (rc)
 		goto out;
 
@@ -145,7 +152,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name,
-				  FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+				  FILE_WRITE_ATTRIBUTES, FILE_CREATE,
 				  CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
 }
 
@@ -164,7 +171,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
 	data.Attributes = cpu_to_le32(dosattrs);
 	tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
-				   FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+				   FILE_WRITE_ATTRIBUTES, FILE_CREATE,
 				   CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
 	if (tmprc == 0)
 		cifs_i->cifsAttrs = dosattrs;
@@ -175,7 +182,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	  struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+				  CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
 				  NULL, SMB2_OP_DELETE);
 }
 
@@ -184,7 +191,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	    struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  0, CREATE_DELETE_ON_CLOSE, NULL,
+				  CREATE_DELETE_ON_CLOSE, NULL,
 				  SMB2_OP_DELETE);
 }
 
@@ -203,7 +210,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
-				FILE_OPEN, 0, 0, smb2_to_name, command);
+				FILE_OPEN, 0, smb2_to_name, command);
 smb2_rename_path:
 	kfree(smb2_to_name);
 	return rc;
@@ -234,7 +241,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	__le64 eof = cpu_to_le64(size);
 	return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				  FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof,
+				  FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
 				  SMB2_OP_SET_EOF);
 }
 
@@ -250,7 +257,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
 	rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
-				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf,
+				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
 				SMB2_OP_SET_INFO);
 	cifs_put_tlink(tlink);
 	return rc;
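
The smb2inode.c changes above are mechanical fallout from the new SMB2_open() calling convention: per-open state now travels in struct cifs_open_parms instead of a long argument list, and the file ids come back through oparms.fid. A minimal sketch of the pattern every caller now follows (condensed from smb2_open_op_close() above; error handling trimmed):

	struct cifs_open_parms oparms;
	struct cifs_fid fid;
	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;

	oparms.tcon = tcon;			/* tree connection for the request */
	oparms.desired_access = FILE_READ_ATTRIBUTES;
	oparms.disposition = FILE_OPEN;		/* open existing, do not create */
	oparms.create_options = 0;
	oparms.fid = &fid;			/* receives persistent/volatile ids */
	oparms.reconnect = false;		/* true only when reclaiming a durable handle */

	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
	if (rc == 0)
		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
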
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 10383d8c015b..b0c43345cd98 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -266,6 +266,10 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 		    ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
 		break;
 	case SMB2_IOCTL:
+		*off = le32_to_cpu(
+		    ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+		*len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+		break;
 	case SMB2_CHANGE_NOTIFY:
 	default:
 		/* BB FIXME for unimplemented cases above */
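
With this hunk, smb2_get_data_area_len() learns where the variable-length payload of an SMB2_IOCTL response lives: OutputOffset is relative to the start of the SMB2 header and OutputCount is the payload length, matching the memcpy in the new SMB2_ioctl() worker further down. Roughly:

	/* sketch: locating the ioctl output buffer in a response */
	char *payload = rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset);
	u32 payload_len = le32_to_cpu(rsp->OutputCount);
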
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f2e76f3b0c61..f259e6cc8357 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -213,22 +213,29 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 			struct cifs_sb_info *cifs_sb, const char *full_path)
 {
 	int rc;
-	__u64 persistent_fid, volatile_fid;
 	__le16 *utf16_path;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
 
 	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
-	rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
 	if (rc) {
 		kfree(utf16_path);
 		return rc;
 	}
 
-	rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	kfree(utf16_path);
 	return rc;
 }
@@ -281,6 +288,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
 }
 
 static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+	seq_puts(m, "\n\tShare Capabilities:");
+	if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+		seq_puts(m, " DFS,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+		seq_puts(m, " CONTINUOUS AVAILABILITY,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+		seq_puts(m, " SCALEOUT,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+		seq_puts(m, " CLUSTER,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+		seq_puts(m, " ASYMMETRIC,");
+	if (tcon->capabilities == 0)
+		seq_puts(m, " None");
+	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+}
+
+static void
 smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 {
 #ifdef CONFIG_CIFS_STATS
@@ -292,7 +318,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 	seq_printf(m, "\nSessionSetups: %d sent %d failed",
 		   atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
 		   atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
-#define SMB2LOGOFF		0x0002 /* trivial request/resp */
 	seq_printf(m, "\nLogoffs: %d sent %d failed",
 		   atomic_read(&sent[SMB2_LOGOFF_HE]),
 		   atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -425,15 +450,20 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 	__le16 *utf16_path;
 	int rc;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
-	__u64 persistent_fid, volatile_fid;
+	struct cifs_open_parms oparms;
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
-	rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0,
-		       &oplock, NULL);
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
 	kfree(utf16_path);
 	if (rc) {
 		cifs_dbg(VFS, "open dir failed\n");
@@ -442,14 +472,12 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 
 	srch_inf->entries_in_buffer = 0;
 	srch_inf->index_of_last_entry = 0;
-	fid->persistent_fid = persistent_fid;
-	fid->volatile_fid = volatile_fid;
 
-	rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
-				  srch_inf);
+	rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+				  fid->volatile_fid, 0, srch_inf);
 	if (rc) {
 		cifs_dbg(VFS, "query directory failed\n");
-		SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+		SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
 	}
 	return rc;
 }
@@ -510,17 +538,25 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
 	      struct kstatfs *buf)
 {
 	int rc;
-	u64 persistent_fid, volatile_fid;
 	__le16 srch_path = 0; /* Null - open root of share */
 	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
 	if (rc)
 		return rc;
 	buf->f_type = SMB2_MAGIC_NUMBER;
-	rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf);
-	SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			   buf);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	return rc;
 }
 
@@ -645,6 +681,7 @@ struct smb_version_operations smb30_operations = {
 	.dump_detail = smb2_dump_detail,
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
+	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
@@ -690,6 +727,7 @@ struct smb_version_operations smb30_operations = {
 	.get_lease_key = smb2_get_lease_key,
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
+	.generate_signingkey = generate_smb3signingkey,
 	.calc_signature = smb3_calc_signature,
 };
 
@@ -709,6 +747,8 @@ struct smb_version_values smb20_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb21_values = {
@@ -727,6 +767,8 @@ struct smb_version_values smb21_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb30_values = {
@@ -745,4 +787,26 @@ struct smb_version_values smb30_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+};
+
+struct smb_version_values smb302_values = {
+	.version_string = SMB302_VERSION_STRING,
+	.protocol_id = SMB302_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
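
The signing_enabled/signing_required masks added to each smb_version_values table describe, per dialect, which SecurityMode bits mean "signing enabled" and "signing required" on the wire, so the negotiate path can compare the server's response against dialect-appropriate flags (routed through cifs_enable_signing() in the smb2pdu.c changes below) instead of open-coding them. An illustrative, hypothetical helper showing the intended use of the new fields (the real logic lives in cifs_enable_signing()):

	static bool server_needs_signing(struct TCP_Server_Info *server)
	{
		/* sec_mode holds the server's negotiated SecurityMode bits */
		return server->sec_mode & server->vals->signing_required;
	}
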
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b95ce2b54e8..abc9c2809b51 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.c
  *
- *   Copyright (C) International Business Machines  Corp., 2009, 2012
+ *   Copyright (C) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	if (!tcon)
 		goto out;
 
+	/* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+	/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+	/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+	if ((tcon->ses) &&
+	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		hdr->CreditCharge = cpu_to_le16(1);
+	/* else CreditCharge MBZ */
+
 	hdr->TreeId = tcon->tid;
 	/* Uid is not converted */
 	if (tcon->ses)
 		hdr->SessionId = tcon->ses->Suid;
-	/* BB check following DFS flags BB */
-	/* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */
-	if (tcon->share_flags & SHI1005_FLAGS_DFS)
-		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS;
-	/* BB how does SMB2 do case sensitive? */
-	/* if (tcon->nocase)
-		hdr->Flags |= SMBFLG_CASELESS; */
-	if (tcon->ses && tcon->ses->server &&
-	    (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
+
+	/*
+	 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+	 * to pass the path on the Open SMB prefixed by \\server\share.
+	 * Not sure when we would need to do the augmented path (if ever) and
+	 * setting this flag breaks the SMB2 open operation since it is
+	 * illegal to send an empty path name (without \\server\share prefix)
+	 * when the DFS flag is set in the SMB open header. We could
+	 * consider setting the flag on all operations other than open
+	 * but it is safer to not set it for now.
+	 */
+/*	if (tcon->share_flags & SHI1005_FLAGS_DFS)
+		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
 		hdr->Flags |= SMB2_FLAGS_SIGNED;
 out:
 	pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	struct kvec iov[1];
 	int rc = 0;
 	int resp_buftype;
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u16 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	int blob_offset, blob_length;
 	char *security_blob;
 	int flags = CIFS_NEG_OP;
 
 	cifs_dbg(FYI, "Negotiate protocol\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
 	req->hdr.SessionId = 0;
 
 	req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	inc_rfc1001_len(req, 2);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
-	req->SecurityMode = cpu_to_le16(temp);
+	if (ses->sign)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		req->SecurityMode = 0;
 
 	req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
 
@@ -399,6 +401,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 		cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
 	else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
 		cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
 	else {
 		cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
 			 le16_to_cpu(rsp->DialectRevision));
@@ -407,6 +411,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	}
 	server->dialect = le16_to_cpu(rsp->DialectRevision);
 
+	/* SMB2 only has an extended negflavor */
+	server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
 	server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
 	server->max_read = le32_to_cpu(rsp->MaxReadSize);
 	server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,44 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 
 	security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
 					       &rsp->hdr);
-	if (blob_length == 0) {
-		cifs_dbg(VFS, "missing security blob on negprot\n");
-		rc = -EIO;
-		goto neg_exit;
-	}
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		cifs_dbg(FYI, "Signing required\n");
-		if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
-		      SMB2_NEGOTIATE_SIGNING_ENABLED))) {
-			cifs_dbg(VFS, "signing required but server lacks support\n");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode |= SECMODE_SIGN_REQUIRED;
-	} else if (sec_flags & CIFSSEC_MAY_SIGN) {
-		cifs_dbg(FYI, "Signing optional\n");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cifs_dbg(FYI, "Server requires signing\n");
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
-		} else {
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-		}
-	} else {
-		cifs_dbg(FYI, "Signing disabled\n");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	}
+	/*
+	 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+	 * for us will be
+	 *	ses->sectype = RawNTLMSSP;
+	 * but for time being this is our only auth choice so doesn't matter.
+	 * We just found a server which sets blob length to zero expecting raw.
+	 */
+	if (blob_length == 0)
+		cifs_dbg(FYI, "missing security blob on negprot\n");
 
+	rc = cifs_enable_signing(server, ses->sign);
 #ifdef CONFIG_SMB2_ASN1  /* BB REMOVEME when updated asn1.c ready */
-	rc = decode_neg_token_init(security_blob, blob_length,
+	if (rc)
+		goto neg_exit;
+	if (blob_length)
+		rc = decode_neg_token_init(security_blob, blob_length,
 				   &server->sec_type);
 	if (rc == 1)
 		rc = 0;
@@ -480,9 +464,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	int rc = 0;
 	int resp_buftype;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u8 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	u16 blob_length = 0;
 	char *security_blob;
 	char *ntlmssp_blob = NULL;
@@ -490,11 +472,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 
 	cifs_dbg(FYI, "Session Setup\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	/*
@@ -505,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	if (!ses->ntlmssp)
 		return -ENOMEM;
 
-	ses->server->secType = RawNTLMSSP;
+	/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+	ses->sectype = RawNTLMSSP;
 
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
@@ -515,28 +496,19 @@ ssetup_ntlmssp_authenticate:
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
 	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
 	req->VcNumber = 0; /* MBZ */
 	/* to enable echos and oplocks */
 	req->hdr.CreditRequest = cpu_to_le16(3);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	if (server->sign)
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+	else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	else
+		req->SecurityMode = 0;
 
-	req->SecurityMode = temp;
 	req->Capabilities = 0;
 	req->Channel = 0; /* MBZ */
 
@@ -679,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 
 	/* since no tcon, smb2_init can not do this, so do here */
 	req->hdr.SessionId = ses->Suid;
-	if (server->sec_mode & SECMODE_SIGN_REQUIRED)
+	if (server->sign)
 		req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -788,11 +760,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	}
 
 	tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+	tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
 	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
 	tcon->tidStatus = CifsGood;
 	tcon->need_reconnect = false;
 	tcon->tid = rsp->hdr.TreeId;
-	strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -874,29 +847,76 @@ create_lease_buf(u8 *lease_key, u8 oplock)
 	return buf;
 }
 
+static struct create_durable *
+create_durable_buf(void)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'Q';
+	return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+	buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'C';
+	return buf;
+}
+
 static __u8
 parse_lease_state(struct smb2_create_rsp *rsp)
 {
 	char *data_offset;
 	struct create_lease *lc;
 	bool found = false;
+	unsigned int next = 0;
+	char *name;
 
-	data_offset = (char *)rsp;
-	data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
+	data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
 	lc = (struct create_lease *)data_offset;
 	do {
-		char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
+		lc = (struct create_lease *)((char *)lc + next);
+		name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
 		if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
 		    strncmp(name, "RqLs", 4)) {
-			lc = (struct create_lease *)((char *)lc
-					+ le32_to_cpu(lc->ccontext.Next));
+			next = le32_to_cpu(lc->ccontext.Next);
 			continue;
 		}
 		if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
 			return SMB2_OPLOCK_LEVEL_NOCHANGE;
 		found = true;
 		break;
-	} while (le32_to_cpu(lc->ccontext.Next) != 0);
+	} while (next != 0);
 
 	if (!found)
 		return 0;
@@ -904,23 +924,74 @@ parse_lease_state(struct smb2_create_rsp *rsp)
 	return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
 }
 
+static int
+add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_lease);
+	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset = cpu_to_le32(
+				sizeof(struct smb2_create_req) - 4 +
+				iov[num - 1].iov_len);
+	req->CreateContextsLength = cpu_to_le32(
+				le32_to_cpu(req->CreateContextsLength) +
+				sizeof(struct create_lease));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
+	*num_iovec = num + 1;
+	return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+		    struct cifs_open_parms *oparms)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	if (oparms->reconnect) {
+		iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+		/* indicate that we don't need to relock the file */
+		oparms->reconnect = false;
+	} else
+		iov[num].iov_base = create_durable_buf();
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_durable);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset =
+			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+								iov[1].iov_len);
+	req->CreateContextsLength =
+			cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
+						sizeof(struct create_durable));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+	*num_iovec = num + 1;
+	return 0;
+}
+
 int
-SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
-	  u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
-	  __u32 create_disposition, __u32 file_attributes, __u32 create_options,
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	  __u8 *oplock, struct smb2_file_all_info *buf)
 {
 	struct smb2_create_req *req;
 	struct smb2_create_rsp *rsp;
 	struct TCP_Server_Info *server;
+	struct cifs_tcon *tcon = oparms->tcon;
 	struct cifs_ses *ses = tcon->ses;
-	struct kvec iov[3];
+	struct kvec iov[4];
 	int resp_buftype;
 	int uni_path_len;
 	__le16 *copy_path = NULL;
 	int copy_size;
 	int rc = 0;
-	int num_iovecs = 2;
+	unsigned int num_iovecs = 2;
+	__u32 file_attributes = 0;
 
 	cifs_dbg(FYI, "create/open\n");
 
@@ -933,55 +1004,47 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 	if (rc)
 		return rc;
 
+	if (oparms->create_options & CREATE_OPTION_READONLY)
+		file_attributes |= ATTR_READONLY;
+
 	req->ImpersonationLevel = IL_IMPERSONATION;
-	req->DesiredAccess = cpu_to_le32(desired_access);
+	req->DesiredAccess = cpu_to_le32(oparms->desired_access);
 	/* File attributes ignored on open (used in create though) */
 	req->FileAttributes = cpu_to_le32(file_attributes);
 	req->ShareAccess = FILE_SHARE_ALL_LE;
-	req->CreateDisposition = cpu_to_le32(create_disposition);
-	req->CreateOptions = cpu_to_le32(create_options);
+	req->CreateDisposition = cpu_to_le32(oparms->disposition);
+	req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
 	uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
-	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)
-			- 8 /* pad */ - 4 /* do not count rfc1001 len field */);
+	/* do not count rfc1001 len field */
+	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
 
 	iov[0].iov_base = (char *)req;
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
 	/* MUST set path len (NameLength) to 0 opening root of share */
-	if (uni_path_len >= 4) {
-		req->NameLength = cpu_to_le16(uni_path_len - 2);
-		/* -1 since last byte is buf[0] which is sent below (path) */
-		iov[0].iov_len--;
-		if (uni_path_len % 8 != 0) {
-			copy_size = uni_path_len / 8 * 8;
-			if (copy_size < uni_path_len)
-				copy_size += 8;
-
-			copy_path = kzalloc(copy_size, GFP_KERNEL);
-			if (!copy_path)
-				return -ENOMEM;
-			memcpy((char *)copy_path, (const char *)path,
-			       uni_path_len);
-			uni_path_len = copy_size;
-			path = copy_path;
-		}
-
-		iov[1].iov_len = uni_path_len;
-		iov[1].iov_base = path;
-		/*
-		 * -1 since last byte is buf[0] which was counted in
-		 * smb2_buf_len.
-		 */
-		inc_rfc1001_len(req, uni_path_len - 1);
-	} else {
-		iov[0].iov_len += 7;
-		req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
-				req->hdr.smb2_buf_length) + 8 - 1);
-		num_iovecs = 1;
-		req->NameLength = 0;
+	req->NameLength = cpu_to_le16(uni_path_len - 2);
+	/* -1 since last byte is buf[0] which is sent below (path) */
+	iov[0].iov_len--;
+	if (uni_path_len % 8 != 0) {
+		copy_size = uni_path_len / 8 * 8;
+		if (copy_size < uni_path_len)
+			copy_size += 8;
+
+		copy_path = kzalloc(copy_size, GFP_KERNEL);
+		if (!copy_path)
+			return -ENOMEM;
+		memcpy((char *)copy_path, (const char *)path,
+		       uni_path_len);
+		uni_path_len = copy_size;
+		path = copy_path;
 	}
 
+	iov[1].iov_len = uni_path_len;
+	iov[1].iov_base = path;
+	/* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+	inc_rfc1001_len(req, uni_path_len - 1);
+
 	if (!server->oplocks)
 		*oplock = SMB2_OPLOCK_LEVEL_NONE;
 
@@ -989,21 +1052,29 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 	    *oplock == SMB2_OPLOCK_LEVEL_NONE)
 		req->RequestedOplockLevel = *oplock;
 	else {
-		iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock);
-		if (iov[num_iovecs].iov_base == NULL) {
+		rc = add_lease_context(iov, &num_iovecs, oplock);
+		if (rc) {
 			cifs_small_buf_release(req);
 			kfree(copy_path);
-			return -ENOMEM;
+			return rc;
+		}
+	}
+
+	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+		/* need to set Next field of lease context if we request it */
+		if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[num_iovecs-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(sizeof(struct create_lease));
+		}
+		rc = add_durable_context(iov, &num_iovecs, oparms);
+		if (rc) {
+			cifs_small_buf_release(req);
+			kfree(copy_path);
+			kfree(iov[num_iovecs-1].iov_base);
+			return rc;
 		}
-		iov[num_iovecs].iov_len = sizeof(struct create_lease);
-		req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) - 4 - 8 +
-				iov[num_iovecs-1].iov_len);
-		req->CreateContextsLength = cpu_to_le32(
-				sizeof(struct create_lease));
-		inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
-		num_iovecs++;
 	}
 
 	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -1014,8 +1085,8 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 		goto creat_exit;
 	}
 
-	*persistent_fid = rsp->PersistentFileId;
-	*volatile_fid = rsp->VolatileFileId;
+	oparms->fid->persistent_fid = rsp->PersistentFileId;
+	oparms->fid->volatile_fid = rsp->VolatileFileId;
 
 	if (buf) {
 		memcpy(buf, &rsp->CreationTime, 32);
@@ -1036,6 +1107,122 @@ creat_exit:
 	return rc;
 }
 
+/*
+ *	SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+	   u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+	struct smb2_ioctl_req *req;
+	struct smb2_ioctl_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[2];
+	int resp_buftype;
+	int num_iovecs;
+	int rc = 0;
+
+	cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+	/* zero out returned data len, in case of error */
+	if (plen)
+		*plen = 0;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->CtlCode = cpu_to_le32(opcode);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	if (indatalen) {
+		req->InputCount = cpu_to_le32(indatalen);
+		/* do not set InputOffset if no input data */
+		req->InputOffset =
+		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+		iov[1].iov_base = in_data;
+		iov[1].iov_len = indatalen;
+		num_iovecs = 2;
+	} else
+		num_iovecs = 1;
+
+	req->OutputOffset = 0;
+	req->OutputCount = 0; /* MBZ */
+
+	/*
+	 * Could increase MaxOutputResponse, but that would require more
+	 * than one credit. Windows typically sets this smaller, but for some
+	 * ioctls it may be useful to allow server to send more. No point
+	 * limiting what the server can send as long as fits in one credit
+	 */
+	req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+	if (is_fsctl)
+		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+	else
+		req->Flags = 0;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	if (indatalen)
+		inc_rfc1001_len(req, indatalen);
+
+	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		if (tcon)
+			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+		goto ioctl_exit;
+	}
+
+	/* check if caller wants to look at return data or just return rc */
+	if ((plen == NULL) || (out_data == NULL))
+		goto ioctl_exit;
+
+	*plen = le32_to_cpu(rsp->OutputCount);
+
+	/* We check for obvious errors in the output buffer length and offset */
+	if (*plen == 0)
+		goto ioctl_exit; /* server returned no data */
+	else if (*plen > 0xFF00) {
+		cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+			 le32_to_cpu(rsp->OutputOffset));
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	*out_data = kmalloc(*plen, GFP_KERNEL);
+	if (*out_data == NULL) {
+		rc = -ENOMEM;
+		goto ioctl_exit;
+	}
+
+	memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+	       *plen);
+ioctl_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
 int
 SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	   u64 persistent_fid, u64 volatile_fid)
@@ -1384,8 +1571,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	case MID_RESPONSE_RECEIVED:
 		credits_received = le16_to_cpu(buf->CreditRequest);
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc;
 
 			rc = smb2_verify_signature(&rqst, server);
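
The new SMB2_ioctl() worker above is deliberately generic: the caller supplies the FSCTL/IOCTL code and optional input buffer, and on success gets back a kmalloc'd copy of the output, capped at 0xFF00 bytes so the request stays within a single credit. A hedged sketch of a caller (FSCTL_QUERY_NETWORK_INTERFACE_INFO is assumed to be the code defined in smbfsctl.h; any pass-through fsctl against an open file follows the same shape):

	char *out = NULL;
	u32 out_len = 0;
	int rc;

	rc = SMB2_ioctl(xid, tcon, fid.persistent_fid, fid.volatile_fid,
			FSCTL_QUERY_NETWORK_INTERFACE_INFO,
			true /* is_fsctl */, NULL, 0, &out, &out_len);
	if (rc == 0) {
		/* out/out_len now describe the server's output buffer */
		kfree(out);
	}
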
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258cb..36b0d37ea69b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.h
  *
- *   Copyright (c) International Business Machines  Corp., 2009, 2010
+ *   Copyright (c) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
 #define SMB20_PROT_ID 0x0202
 #define SMB21_PROT_ID 0x0210
 #define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
 #define BAD_PROT_ID 0xFFFF
 
 /* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
 #define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING		0x00000400
 #define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
 #define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK		0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
+#define SHI1005_FLAGS_ALL				0x0000FF33
 
 /* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008)
+#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT	cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER	cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
 
 struct smb2_tree_disconnect_req {
 	struct smb2_hdr hdr;
@@ -420,7 +428,7 @@ struct smb2_create_req {
 	__le16 NameLength;
 	__le32 CreateContextsOffset;
 	__le32 CreateContextsLength;
-	__u8   Buffer[8];
+	__u8   Buffer[0];
 } __packed;
 
 struct smb2_create_rsp {
@@ -477,6 +485,87 @@ struct create_lease {
 	struct lease_context lcontext;
 } __packed;
 
+struct create_durable {
+	struct create_context ccontext;
+	__u8   Name[8];
+	union {
+		__u8  Reserved[16];
+		struct {
+			__u64 PersistentFileId;
+			__u64 VolatileFileId;
+		} Fid;
+	} Data;
+} __packed;
+
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+	char SourceKey[24];
+	__le32 ChunkCount; /* we are only sending 1 */
+	__le32 Reserved;
+	/* array will only be one chunk long for us */
+	__le64 SourceOffset;
+	__le64 TargetOffset;
+	__le32 Length; /* how many bytes to copy */
+	__u32 Reserved2;
+} __packed;
+
+/* Response and Request are the same format */
+struct validate_negotiate_info {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 DialectCount;
+	__le16 Dialect[1];
+} __packed;
+
+#define RSS_CAPABLE	0x00000001
+#define RDMA_CAPABLE	0x00000002
+
+struct network_interface_info_ioctl_rsp {
+	__le32 Next; /* next interface. zero if this is last one */
+	__le32 IfIndex;
+	__le32 Capability; /* RSS or RDMA Capable */
+	__le32 Reserved;
+	__le64 LinkSpeed;
+	char   SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct smb2_ioctl_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 MaxInputResponse;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 MaxOutputResponse;
+	__le32 Flags;
+	__u32  Reserved2;
+	char   Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 Flags;
+	__u32  Reserved2;
+	/* char * buffer[] */
+} __packed;
+
 /* Currently defined values for close flags */
 #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB	cpu_to_le16(0x0001)
 struct smb2_close_req {
@@ -517,17 +606,25 @@ struct smb2_flush_rsp {
 	__le16 Reserved;
 } __packed;
 
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED	0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE		0x00000000
+#define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+
 struct smb2_read_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize; /* Must be 49 */
 	__u8   Padding; /* offset from start of SMB2 header to place read */
-	__u8   Reserved;
+	__u8   Flags; /* MBZ unless SMB3.02 or later */
 	__le32 Length;
 	__le64 Offset;
 	__u64  PersistentFileId; /* opaque endianness */
 	__u64  VolatileFileId; /* opaque endianness */
 	__le32 MinimumCount;
-	__le32 Channel; /* Reserved MBZ */
+	__le32 Channel; /* MBZ except for SMB3 or later */
 	__le32 RemainingBytes;
 	__le16 ReadChannelInfoOffset; /* Reserved MBZ */
 	__le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +642,9 @@ struct smb2_read_rsp {
 	__u8   Buffer[1];
 } __packed;
 
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH	0x00000001	/* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED	0x00000002	/* SMB3.02 or later */
 
 struct smb2_write_req {
 	struct smb2_hdr hdr;
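
On the wire, the create_durable context added above is a fixed 40-byte blob: the 16-byte create_context header, an 8-byte Name field of which only the first four bytes are used ("DHnQ" to request a durable handle, "DHnC" to reconnect one; see create_durable_buf()/create_reconnect_durable_buf() in smb2pdu.c above), then 16 bytes of Data (zeroed for a request; the persistent/volatile file id pair when reconnecting). A compile-time check of that size, assuming create_context is the usual 16-byte header already defined in this file:

	/* sketch: sizeof(create_context) + Name[8] + Data[16] == 40 */
	BUILD_BUG_ON(sizeof(struct create_durable) != 40);
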
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38ce..1a5ecbed40ed 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,11 +84,9 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
 				       const char *from_name, const char *to_name,
 				       struct cifs_sb_info *cifs_sb);
 
-extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon,
-			  const char *full_path, int disposition,
-			  int desired_access, int create_options,
-			  struct cifs_fid *fid, __u32 *oplock,
-			  FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
+extern int smb2_open_file(const unsigned int xid,
+			  struct cifs_open_parms *oparms,
+			  __u32 *oplock, FILE_ALL_INFO *buf);
 extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
 extern int smb2_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
@@ -106,11 +104,13 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
 		     const char *tree, struct cifs_tcon *tcon,
 		     const struct nls_table *);
 extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
-extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
-		     __le16 *path, u64 *persistent_fid, u64 *volatile_fid,
-		     __u32 desired_access, __u32 create_disposition,
-		     __u32 file_attributes, __u32 create_options,
-		     __u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     __le16 *path, __u8 *oplock,
+		     struct smb2_file_all_info *buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
+		     bool is_fsctl, char *in_data, u32 indatalen,
+		     char **out_data, u32 *plen /* returned data len */);
 extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 01f0ac800780..4f2300d020c7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,6 +39,82 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int
43smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
44{
45 int rc;
46 unsigned int size;
47
48 if (server->secmech.sdeschmacsha256 != NULL)
49 return 0; /* already allocated */
50
51 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
52 if (IS_ERR(server->secmech.hmacsha256)) {
53 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
54 rc = PTR_ERR(server->secmech.hmacsha256);
55 server->secmech.hmacsha256 = NULL;
56 return rc;
57 }
58
59 size = sizeof(struct shash_desc) +
60 crypto_shash_descsize(server->secmech.hmacsha256);
61 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
62 if (!server->secmech.sdeschmacsha256) {
63 crypto_free_shash(server->secmech.hmacsha256);
64 server->secmech.hmacsha256 = NULL;
65 return -ENOMEM;
66 }
67 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
68 server->secmech.sdeschmacsha256->shash.flags = 0x0;
69
70 return 0;
71}
72
73static int
74smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
75{
76 unsigned int size;
77 int rc;
78
79 if (server->secmech.sdesccmacaes != NULL)
80 return 0; /* already allocated */
81
82 rc = smb2_crypto_shash_allocate(server);
83 if (rc)
84 return rc;
85
86 server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
87 if (IS_ERR(server->secmech.cmacaes)) {
88 cifs_dbg(VFS, "could not allocate crypto cmac-aes");
89 kfree(server->secmech.sdeschmacsha256);
90 server->secmech.sdeschmacsha256 = NULL;
91 crypto_free_shash(server->secmech.hmacsha256);
92 server->secmech.hmacsha256 = NULL;
93 rc = PTR_ERR(server->secmech.cmacaes);
94 server->secmech.cmacaes = NULL;
95 return rc;
96 }
97
98 size = sizeof(struct shash_desc) +
99 crypto_shash_descsize(server->secmech.cmacaes);
100 server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
101 if (!server->secmech.sdesccmacaes) {
102 cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
103 kfree(server->secmech.sdeschmacsha256);
104 server->secmech.sdeschmacsha256 = NULL;
105 crypto_free_shash(server->secmech.hmacsha256);
106 crypto_free_shash(server->secmech.cmacaes);
107 server->secmech.hmacsha256 = NULL;
108 server->secmech.cmacaes = NULL;
109 return -ENOMEM;
110 }
111 server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
112 server->secmech.sdesccmacaes->shash.flags = 0x0;
113
114 return 0;
115}
116
117
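
[Note: the two allocators above follow the usual kernel shash pattern: allocate the
transform with crypto_alloc_shash(), then kmalloc a shash_desc sized by
crypto_shash_descsize(). A minimal sketch of driving the allocated descriptor for a
one-shot HMAC-SHA256 digest; the helper name is illustrative, not part of the patch.]

    static int hmac_sha256_once(struct TCP_Server_Info *server,
                                const u8 *data, unsigned int len, u8 *out)
    {
        int rc;

        rc = crypto_shash_setkey(server->secmech.hmacsha256,
                                 server->session_key.response,
                                 SMB2_NTLMV2_SESSKEY_SIZE);
        if (rc)
            return rc;
        rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
        if (rc)
            return rc;
        rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
                                 data, len);
        if (rc)
            return rc;
        return crypto_shash_final(&server->secmech.sdeschmacsha256->shash, out);
    }
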
42int 118int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 119smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 120{
@@ -52,6 +128,12 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
52 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); 128 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
53 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); 129 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
54 130
131 rc = smb2_crypto_shash_allocate(server);
132 if (rc) {
 133		cifs_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
134 return rc;
135 }
136
55 rc = crypto_shash_setkey(server->secmech.hmacsha256, 137 rc = crypto_shash_setkey(server->secmech.hmacsha256,
56 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); 138 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
57 if (rc) { 139 if (rc) {
@@ -61,7 +143,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
61 143
62 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); 144 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
63 if (rc) { 145 if (rc) {
 64 		cifs_dbg(VFS, "%s: Could not init md5\n", __func__); 146		cifs_dbg(VFS, "%s: Could not init sha256\n", __func__);
65 return rc; 147 return rc;
66 } 148 }
67 149
@@ -116,11 +198,166 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 198 return rc;
117} 199}
118 200
201void
202generate_smb3signingkey(struct TCP_Server_Info *server)
203{
204 unsigned char zero = 0x0;
205 __u8 i[4] = {0, 0, 0, 1};
206 __u8 L[4] = {0, 0, 0, 128};
207 int rc = 0;
208 unsigned char prfhash[SMB2_HMACSHA256_SIZE];
209 unsigned char *hashptr = prfhash;
210
211 memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
212 memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
213
214 rc = smb3_crypto_shash_allocate(server);
215 if (rc) {
216 cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
217 goto smb3signkey_ret;
218 }
219
220 rc = crypto_shash_setkey(server->secmech.hmacsha256,
221 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
222 if (rc) {
223 cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
224 goto smb3signkey_ret;
225 }
226
227 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
228 if (rc) {
229 cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
230 goto smb3signkey_ret;
231 }
232
233 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
234 i, 4);
235 if (rc) {
236 cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
237 goto smb3signkey_ret;
238 }
239
240 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
241 "SMB2AESCMAC", 12);
242 if (rc) {
243 cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
244 goto smb3signkey_ret;
245 }
246
247 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
248 &zero, 1);
249 if (rc) {
250 cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
251 goto smb3signkey_ret;
252 }
253
254 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
255 "SmbSign", 8);
256 if (rc) {
257 cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
258 goto smb3signkey_ret;
259 }
260
261 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
262 L, 4);
263 if (rc) {
264 cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
265 goto smb3signkey_ret;
266 }
267
268 rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
269 hashptr);
270 if (rc) {
271 cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
272 goto smb3signkey_ret;
273 }
274
275 memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
276
277smb3signkey_ret:
278 return;
279}
280
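
[Note: generate_smb3signingkey() above is an SP800-108 counter-mode KDF with a single
iteration: SigningKey = HMAC-SHA256(SessionKey, i || "SMB2AESCMAC\0" || 0x00 ||
"SmbSign\0" || L), truncated to 16 bytes, where i = 1 and L = 128 bits, matching the
update calls. A hedged userspace sketch of the same derivation using OpenSSL's HMAC();
all names here are illustrative.]

    #include <openssl/evp.h>
    #include <openssl/hmac.h>
    #include <string.h>

    static void smb3_signing_key_sketch(const unsigned char session_key[16],
                                        unsigned char signing_key[16])
    {
        unsigned char msg[4 + 12 + 1 + 8 + 4];
        unsigned char prf[32];
        unsigned int prf_len = sizeof(prf);
        unsigned char *p = msg;

        memcpy(p, "\x00\x00\x00\x01", 4); p += 4;   /* i = 1, big endian */
        memcpy(p, "SMB2AESCMAC", 12);     p += 12;  /* label incl. NUL */
        *p++ = 0x00;                                /* separator */
        memcpy(p, "SmbSign", 8);          p += 8;   /* context incl. NUL */
        memcpy(p, "\x00\x00\x00\x80", 4);           /* L = 128 bits */

        HMAC(EVP_sha256(), session_key, 16, msg, sizeof(msg), prf, &prf_len);
        memcpy(signing_key, prf, 16);               /* truncate to 128 bits */
    }
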
119int 281int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 282smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{ 283{
122 cifs_dbg(FYI, "smb3 signatures not supported yet\n"); 284 int i, rc;
123 return -EOPNOTSUPP; 285 unsigned char smb3_signature[SMB2_CMACAES_SIZE];
286 unsigned char *sigptr = smb3_signature;
287 struct kvec *iov = rqst->rq_iov;
288 int n_vec = rqst->rq_nvec;
289 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
290
291 memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
292 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
293
294 rc = crypto_shash_setkey(server->secmech.cmacaes,
295 server->smb3signingkey, SMB2_CMACAES_SIZE);
296 if (rc) {
297 cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
298 return rc;
299 }
300
301 /*
 302	 * We already allocated sdesccmacaes when we initialized the smb3
 303	 * signing key, so unlike the smb2 case we do not have to check
 304	 * here that the secmech is initialized.
305 */
306 rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
307 if (rc) {
308 cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
309 return rc;
310 }
311
312 for (i = 0; i < n_vec; i++) {
313 if (iov[i].iov_len == 0)
314 continue;
315 if (iov[i].iov_base == NULL) {
 316			cifs_dbg(VFS, "null iovec entry\n");
317 return -EIO;
318 }
319 /*
 320		 * The first entry includes a length field (which does not get
 321		 * signed) that occupies the first 4 bytes before the header.
322 */
323 if (i == 0) {
324 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
325 break; /* nothing to sign or corrupt header */
326 rc =
327 crypto_shash_update(
328 &server->secmech.sdesccmacaes->shash,
329 iov[i].iov_base + 4, iov[i].iov_len - 4);
330 } else {
331 rc =
332 crypto_shash_update(
333 &server->secmech.sdesccmacaes->shash,
334 iov[i].iov_base, iov[i].iov_len);
335 }
336 if (rc) {
337 cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
338 __func__);
339 return rc;
340 }
341 }
342
343 /* now hash over the rq_pages array */
344 for (i = 0; i < rqst->rq_npages; i++) {
345 struct kvec p_iov;
346
347 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
348 crypto_shash_update(&server->secmech.sdesccmacaes->shash,
349 p_iov.iov_base, p_iov.iov_len);
350 kunmap(rqst->rq_pages[i]);
351 }
352
353 rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
354 sigptr);
355 if (rc)
356 cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
357
358 memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
359
360 return rc;
124} 361}
125 362
126/* must be called with server->srv_mutex held */ 363/* must be called with server->srv_mutex held */
@@ -275,8 +512,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
275 512
276 dump_smb(mid->resp_buf, min_t(u32, 80, len)); 513 dump_smb(mid->resp_buf, min_t(u32, 80, len));
277 /* convert the length into a more usable form */ 514 /* convert the length into a more usable form */
278 if ((len > 24) && 515 if (len > 24 && server->sign) {
279 (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
280 int rc; 516 int rc;
281 517
282 rc = smb2_verify_signature(&rqst, server); 518 rc = smb2_verify_signature(&rqst, server);
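
[Note: smb2_verify_signature(), called above, is the mirror image of the calc
routines: save the signature from the wire, zero the field, recompute, compare. A
sketch of that shape; the calc_signature server op and the -EACCES return are
assumptions, not quoted from the patch.]

    static int smb2_verify_sketch(struct smb_rqst *rqst,
                                  struct TCP_Server_Info *server)
    {
        struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
        u8 wire_sig[SMB2_SIGNATURE_SIZE];
        int rc;

        memcpy(wire_sig, hdr->Signature, SMB2_SIGNATURE_SIZE);
        memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
        rc = server->ops->calc_signature(rqst, server); /* smb2 or smb3 */
        if (rc)
            return rc;
        return memcmp(wire_sig, hdr->Signature, SMB2_SIGNATURE_SIZE) ?
            -EACCES : 0;
    }
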
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e087..d952ee48f4dc 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions 2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2009 4 * Copyright (c) International Business Machines Corp., 2002,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
22/* IOCTL information */ 22/* IOCTL information */
23/* 23/*
24 * List of ioctl/fsctl function codes that are or could be useful in the 24 * List of ioctl/fsctl function codes that are or could be useful in the
25 * future to remote clients like cifs or SMB2 client. There is probably 25 * future to remote clients like cifs or SMB2/SMB3 client. This is probably
26 * a slightly larger set of fsctls that NTFS local filesystem could handle, 26 * a slightly larger set of fsctls that NTFS local filesystem could handle,
27 * including the seven below that we do not have struct definitions for. 27 * including the seven below that we do not have struct definitions for.
28 * Even with protocol definitions for most of these now available, we still 28 * Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
30 * remotely. Some of the following, such as the encryption/compression ones 30 * remotely. Some of the following, such as the encryption/compression ones
31 * could be invoked from tools via a specialized hook into the VFS rather 31 * could be invoked from tools via a specialized hook into the VFS rather
32 * than via the standard vfs entry points 32 * than via the standard vfs entry points
33 *
 34 * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list is
35 * below). Additional detail on less common ones can be found in MS-FSCC
36 * section 2.3.
33 */ 37 */
38#define FSCTL_DFS_GET_REFERRALS 0x00060194
39#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
34#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 40#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
35#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 41#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
36#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 42#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,31 @@
71#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ 77#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
72#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ 78#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
73#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ 79#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
80#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
74#define FSCTL_SIS_LINK_FILES 0x0009C104 81#define FSCTL_SIS_LINK_FILES 0x0009C104
75#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ 82#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
76#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ 83#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
77/* strange that the number for this op is not sequential with previous op */ 84/* strange that the number for this op is not sequential with previous op */
78#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ 85#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
86/* Enumerate previous versions of a file */
87#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
 89/* Retrieve an opaque file reference for server-side data movement, i.e. copy */
89#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
90#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
79#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ 91#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
80#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ 92#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
93#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
94/* Perform server-side data movement */
95#define FSCTL_SRV_COPYCHUNK 0x001440F2
96#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
97#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
98#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
81 99
82#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 100#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
83#define IO_REPARSE_TAG_HSM 0xC0000004 101#define IO_REPARSE_TAG_HSM 0xC0000004
84#define IO_REPARSE_TAG_SIS 0x80000007 102#define IO_REPARSE_TAG_SIS 0x80000007
103
104/* fsctl flags */
105/* If Flags is set to this value, the request is an FSCTL not ioctl request */
106#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
107
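
[Note: these codes pair with the reworked SMB2_ioctl() prototype earlier in this
diff; is_fsctl selects the SMB2_0_IOCTL_IS_FSCTL flag. A hypothetical caller sketch;
the fid variables and the kfree of the returned buffer are assumptions.]

    char *res = NULL;
    u32 res_len = 0;
    int rc;

    rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
                    FSCTL_SRV_REQUEST_RESUME_KEY, true /* FSCTL, not ioctl */,
                    NULL, 0 /* no input payload */, &res, &res_len);
    if (rc == 0) {
        /* use the resume key in res[0..res_len) for a copychunk ... */
        kfree(res);
    }
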
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfbf4700d160..6fdcb1b4a106 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
447{ 447{
448 int error; 448 int error;
449 449
450 error = wait_event_freezekillable(server->response_q, 450 error = wait_event_freezekillable_unsafe(server->response_q,
451 midQ->mid_state != MID_REQUEST_SUBMITTED); 451 midQ->mid_state != MID_REQUEST_SUBMITTED);
452 if (error < 0) 452 if (error < 0)
453 return -ERESTARTSYS; 453 return -ERESTARTSYS;
@@ -463,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
463 struct mid_q_entry *mid; 463 struct mid_q_entry *mid;
464 464
465 /* enable signing if server requires it */ 465 /* enable signing if server requires it */
466 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 466 if (server->sign)
467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
468 468
469 mid = AllocMidQEntry(hdr, server); 469 mid = AllocMidQEntry(hdr, server);
@@ -612,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
612 dump_smb(mid->resp_buf, min_t(u32, 92, len)); 612 dump_smb(mid->resp_buf, min_t(u32, 92, len));
613 613
614 /* convert the length into a more usable form */ 614 /* convert the length into a more usable form */
615 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 615 if (server->sign) {
616 struct kvec iov; 616 struct kvec iov;
617 int rc = 0; 617 int rc = 0;
618 struct smb_rqst rqst = { .rq_iov = &iov, 618 struct smb_rqst rqst = { .rq_iov = &iov,
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05c062c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
43 struct inode *new_inode, struct dentry *new_dentry); 43 struct inode *new_inode, struct dentry *new_dentry);
44 44
45/* dir file-ops */ 45/* dir file-ops */
46static int coda_readdir(struct file *file, void *buf, filldir_t filldir); 46static int coda_readdir(struct file *file, struct dir_context *ctx);
47 47
48/* dentry ops */ 48/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); 49static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
50static int coda_dentry_delete(const struct dentry *); 50static int coda_dentry_delete(const struct dentry *);
51 51
52/* support routines */ 52/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 53static int coda_venus_readdir(struct file *, struct dir_context *);
54 filldir_t filldir);
55 54
56/* same as fs/bad_inode.c */ 55/* same as fs/bad_inode.c */
57static int coda_return_EIO(void) 56static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
85const struct file_operations coda_dir_operations = { 84const struct file_operations coda_dir_operations = {
86 .llseek = generic_file_llseek, 85 .llseek = generic_file_llseek,
87 .read = generic_read_dir, 86 .read = generic_read_dir,
88 .readdir = coda_readdir, 87 .iterate = coda_readdir,
89 .open = coda_open, 88 .open = coda_open,
90 .release = coda_release, 89 .release = coda_release,
91 .fsync = coda_fsync, 90 .fsync = coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
378 377
379 378
380/* file operations for directories */ 379/* file operations for directories */
381static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) 380static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
382{ 381{
383 struct coda_file_info *cfi; 382 struct coda_file_info *cfi;
384 struct file *host_file; 383 struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
391 if (!host_file->f_op) 390 if (!host_file->f_op)
392 return -ENOTDIR; 391 return -ENOTDIR;
393 392
394 if (host_file->f_op->readdir) 393 if (host_file->f_op->iterate) {
395 {
396 /* potemkin case: we were handed a directory inode.
397 * We can't use vfs_readdir because we have to keep the file
398 * position in sync between the coda_file and the host_file.
399 * and as such we need grab the inode mutex. */
400 struct inode *host_inode = file_inode(host_file); 394 struct inode *host_inode = file_inode(host_file);
401
402 mutex_lock(&host_inode->i_mutex); 395 mutex_lock(&host_inode->i_mutex);
403 host_file->f_pos = coda_file->f_pos;
404
405 ret = -ENOENT; 396 ret = -ENOENT;
406 if (!IS_DEADDIR(host_inode)) { 397 if (!IS_DEADDIR(host_inode)) {
407 ret = host_file->f_op->readdir(host_file, buf, filldir); 398 ret = host_file->f_op->iterate(host_file, ctx);
408 file_accessed(host_file); 399 file_accessed(host_file);
409 } 400 }
410
411 coda_file->f_pos = host_file->f_pos;
412 mutex_unlock(&host_inode->i_mutex); 401 mutex_unlock(&host_inode->i_mutex);
402 return ret;
413 } 403 }
414 else /* Venus: we must read Venus dirents from a file */ 404 /* Venus: we must read Venus dirents from a file */
415 ret = coda_venus_readdir(coda_file, buf, filldir); 405 return coda_venus_readdir(coda_file, ctx);
416
417 return ret;
418} 406}
419 407
420static inline unsigned int CDT2DT(unsigned char cdt) 408static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
437} 425}
438 426
439/* support routines */ 427/* support routines */
440static int coda_venus_readdir(struct file *coda_file, void *buf, 428static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
441 filldir_t filldir)
442{ 429{
443 int result = 0; /* # of entries returned */
444 struct coda_file_info *cfi; 430 struct coda_file_info *cfi;
445 struct coda_inode_info *cii; 431 struct coda_inode_info *cii;
446 struct file *host_file; 432 struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
462 vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); 448 vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
463 if (!vdir) return -ENOMEM; 449 if (!vdir) return -ENOMEM;
464 450
465 if (coda_file->f_pos == 0) { 451 if (!dir_emit_dots(coda_file, ctx))
466 ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); 452 goto out;
467 if (ret < 0) 453
468 goto out;
469 result++;
470 coda_file->f_pos++;
471 }
472 if (coda_file->f_pos == 1) {
473 ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
474 if (ret < 0)
475 goto out;
476 result++;
477 coda_file->f_pos++;
478 }
479 while (1) { 454 while (1) {
480 /* read entries from the directory file */ 455 /* read entries from the directory file */
481 ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, 456 ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
482 sizeof(*vdir)); 457 sizeof(*vdir));
483 if (ret < 0) { 458 if (ret < 0) {
484 printk(KERN_ERR "coda readdir: read dir %s failed %d\n", 459 printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
507 482
508 /* Make sure we skip '.' and '..', we already got those */ 483 /* Make sure we skip '.' and '..', we already got those */
509 if (name.name[0] == '.' && (name.len == 1 || 484 if (name.name[0] == '.' && (name.len == 1 ||
510 (vdir->d_name[1] == '.' && name.len == 2))) 485 (name.name[1] == '.' && name.len == 2)))
511 vdir->d_fileno = name.len = 0; 486 vdir->d_fileno = name.len = 0;
512 487
513 /* skip null entries */ 488 /* skip null entries */
514 if (vdir->d_fileno && name.len) { 489 if (vdir->d_fileno && name.len) {
515 /* try to look up this entry in the dcache, that way 490 ino = vdir->d_fileno;
516 * userspace doesn't have to worry about breaking
517 * getcwd by having mismatched inode numbers for
518 * internal volume mountpoints. */
519 ino = find_inode_number(de, &name);
520 if (!ino) ino = vdir->d_fileno;
521
522 type = CDT2DT(vdir->d_type); 491 type = CDT2DT(vdir->d_type);
523 ret = filldir(buf, name.name, name.len, 492 if (!dir_emit(ctx, name.name, name.len, ino, type))
524 coda_file->f_pos, ino, type); 493 break;
525 /* failure means no space for filling in this round */
526 if (ret < 0) break;
527 result++;
528 } 494 }
529 /* we'll always have progress because d_reclen is unsigned and 495 /* we'll always have progress because d_reclen is unsigned and
530 * we've already established it is non-zero. */ 496 * we've already established it is non-zero. */
531 coda_file->f_pos += vdir->d_reclen; 497 ctx->pos += vdir->d_reclen;
532 } 498 }
533out: 499out:
534 kfree(vdir); 500 kfree(vdir);
535 return result ? result : ret; 501 return 0;
536} 502}
537 503
538/* called when a cache lookup succeeds */ 504/* called when a cache lookup succeeds */
@@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
560 if (cii->c_flags & C_FLUSH) 526 if (cii->c_flags & C_FLUSH)
561 coda_flag_inode_children(inode, C_FLUSH); 527 coda_flag_inode_children(inode, C_FLUSH);
562 528
563 if (de->d_count > 1) 529 if (d_count(de) > 1)
564 /* pretend it's valid, but don't change the flags */ 530 /* pretend it's valid, but don't change the flags */
565 goto out; 531 goto out;
566 532
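
[Note: the coda changes above are one instance of the tree-wide readdir -> iterate
conversion: instead of calling a filldir callback and juggling f_pos, an .iterate
method emits entries through dir_emit()/dir_emit_dots() and advances ctx->pos itself.
A minimal sketch of the pattern; "myfs" and its entry helpers are hypothetical.]

    static int myfs_iterate(struct file *file, struct dir_context *ctx)
    {
        struct myfs_entry *e;

        if (!dir_emit_dots(file, ctx))  /* emits "." and ".." as needed */
            return 0;

        for (e = myfs_entry_at(file, ctx->pos); e; e = e->next) {
            if (!dir_emit(ctx, e->name, e->namelen, e->ino, e->dtype))
                return 0;               /* caller's buffer is full */
            ctx->pos++;                 /* fs-defined position, not f_pos */
        }
        return 0;
    }
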
diff --git a/fs/compat.c b/fs/compat.c
index fc3b55dce184..6af20de2c1a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -832,6 +832,7 @@ struct compat_old_linux_dirent {
832}; 832};
833 833
834struct compat_readdir_callback { 834struct compat_readdir_callback {
835 struct dir_context ctx;
835 struct compat_old_linux_dirent __user *dirent; 836 struct compat_old_linux_dirent __user *dirent;
836 int result; 837 int result;
837}; 838};
@@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
873{ 874{
874 int error; 875 int error;
875 struct fd f = fdget(fd); 876 struct fd f = fdget(fd);
876 struct compat_readdir_callback buf; 877 struct compat_readdir_callback buf = {
878 .ctx.actor = compat_fillonedir,
879 .dirent = dirent
880 };
877 881
878 if (!f.file) 882 if (!f.file)
879 return -EBADF; 883 return -EBADF;
880 884
881 buf.result = 0; 885 error = iterate_dir(f.file, &buf.ctx);
882 buf.dirent = dirent;
883
884 error = vfs_readdir(f.file, compat_fillonedir, &buf);
885 if (buf.result) 886 if (buf.result)
886 error = buf.result; 887 error = buf.result;
887 888
@@ -897,6 +898,7 @@ struct compat_linux_dirent {
897}; 898};
898 899
899struct compat_getdents_callback { 900struct compat_getdents_callback {
901 struct dir_context ctx;
900 struct compat_linux_dirent __user *current_dir; 902 struct compat_linux_dirent __user *current_dir;
901 struct compat_linux_dirent __user *previous; 903 struct compat_linux_dirent __user *previous;
902 int count; 904 int count;
@@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
951{ 953{
952 struct fd f; 954 struct fd f;
953 struct compat_linux_dirent __user * lastdirent; 955 struct compat_linux_dirent __user * lastdirent;
954 struct compat_getdents_callback buf; 956 struct compat_getdents_callback buf = {
957 .ctx.actor = compat_filldir,
958 .current_dir = dirent,
959 .count = count
960 };
955 int error; 961 int error;
956 962
957 if (!access_ok(VERIFY_WRITE, dirent, count)) 963 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
961 if (!f.file) 967 if (!f.file)
962 return -EBADF; 968 return -EBADF;
963 969
964 buf.current_dir = dirent; 970 error = iterate_dir(f.file, &buf.ctx);
965 buf.previous = NULL;
966 buf.count = count;
967 buf.error = 0;
968
969 error = vfs_readdir(f.file, compat_filldir, &buf);
970 if (error >= 0) 971 if (error >= 0)
971 error = buf.error; 972 error = buf.error;
972 lastdirent = buf.previous; 973 lastdirent = buf.previous;
973 if (lastdirent) { 974 if (lastdirent) {
974 if (put_user(f.file->f_pos, &lastdirent->d_off)) 975 if (put_user(buf.ctx.pos, &lastdirent->d_off))
975 error = -EFAULT; 976 error = -EFAULT;
976 else 977 else
977 error = count - buf.count; 978 error = count - buf.count;
@@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
983#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 984#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
984 985
985struct compat_getdents_callback64 { 986struct compat_getdents_callback64 {
987 struct dir_context ctx;
986 struct linux_dirent64 __user *current_dir; 988 struct linux_dirent64 __user *current_dir;
987 struct linux_dirent64 __user *previous; 989 struct linux_dirent64 __user *previous;
988 int count; 990 int count;
@@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1036{ 1038{
1037 struct fd f; 1039 struct fd f;
1038 struct linux_dirent64 __user * lastdirent; 1040 struct linux_dirent64 __user * lastdirent;
1039 struct compat_getdents_callback64 buf; 1041 struct compat_getdents_callback64 buf = {
1042 .ctx.actor = compat_filldir64,
1043 .current_dir = dirent,
1044 .count = count
1045 };
1040 int error; 1046 int error;
1041 1047
1042 if (!access_ok(VERIFY_WRITE, dirent, count)) 1048 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1046 if (!f.file) 1052 if (!f.file)
1047 return -EBADF; 1053 return -EBADF;
1048 1054
1049 buf.current_dir = dirent; 1055 error = iterate_dir(f.file, &buf.ctx);
1050 buf.previous = NULL;
1051 buf.count = count;
1052 buf.error = 0;
1053
1054 error = vfs_readdir(f.file, compat_filldir64, &buf);
1055 if (error >= 0) 1056 if (error >= 0)
1056 error = buf.error; 1057 error = buf.error;
1057 lastdirent = buf.previous; 1058 lastdirent = buf.previous;
1058 if (lastdirent) { 1059 if (lastdirent) {
1059 typeof(lastdirent->d_off) d_off = f.file->f_pos; 1060 typeof(lastdirent->d_off) d_off = buf.ctx.pos;
1060 if (__put_user_unaligned(d_off, &lastdirent->d_off)) 1061 if (__put_user_unaligned(d_off, &lastdirent->d_off))
1061 error = -EFAULT; 1062 error = -EFAULT;
1062 else 1063 else
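
[Note: the compat getdents conversions above all rely on the same container pattern:
the dir_context must be the first member of the private callback struct so the actor
can cast back from the context pointer. An illustrative sketch; struct and helper
names are invented.]

    struct my_readdir_callback {
        struct dir_context ctx;          /* must come first */
        struct my_dirent __user *dirent; /* hypothetical user buffer */
        int count;
    };

    static int my_filldir(void *__buf, const char *name, int namlen,
                          loff_t offset, u64 ino, unsigned int d_type)
    {
        struct my_readdir_callback *buf = __buf; /* safe: ctx is first */

        /* ... copy one entry to buf->dirent, decrement buf->count ... */
        return 0;
    }

    /* caller side, mirroring compat_sys_getdents() above: */
    struct my_readdir_callback buf = {
        .ctx.actor = my_filldir,
        .dirent = dirent,
        .count = count
    };
    error = iterate_dir(f.file, &buf.ctx);
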
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 996cdc5abb85..5d19acfa7c6c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
66#include <linux/gigaset_dev.h> 66#include <linux/gigaset_dev.h>
67 67
68#ifdef CONFIG_BLOCK 68#ifdef CONFIG_BLOCK
69#include <linux/loop.h>
70#include <linux/cdrom.h> 69#include <linux/cdrom.h>
71#include <linux/fd.h> 70#include <linux/fd.h>
72#include <scsi/scsi.h> 71#include <scsi/scsi.h>
@@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
954/* Socket level stuff */ 953/* Socket level stuff */
955COMPATIBLE_IOCTL(FIOQSIZE) 954COMPATIBLE_IOCTL(FIOQSIZE)
956#ifdef CONFIG_BLOCK 955#ifdef CONFIG_BLOCK
957/* loop */
958IGNORE_IOCTL(LOOP_CLR_FD)
959/* md calls this on random blockdevs */ 956/* md calls this on random blockdevs */
960IGNORE_IOCTL(RAID_VERSION) 957IGNORE_IOCTL(RAID_VERSION)
961/* qemu/qemu-img might call these two on plain files for probing */ 958/* qemu/qemu-img might call these two on plain files for probing */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6ad4e9b..277bd1be21fd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode,d); 388 simple_rmdir(parent->d_inode,d);
389 389
390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
391 391
392 dput(parent); 392 dput(parent);
393} 393}
@@ -660,19 +660,15 @@ static int create_default_group(struct config_group *parent_group,
660 struct config_group *group) 660 struct config_group *group)
661{ 661{
662 int ret; 662 int ret;
663 struct qstr name;
664 struct configfs_dirent *sd; 663 struct configfs_dirent *sd;
665 /* We trust the caller holds a reference to parent */ 664 /* We trust the caller holds a reference to parent */
666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry; 665 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
667 666
668 if (!group->cg_item.ci_name) 667 if (!group->cg_item.ci_name)
669 group->cg_item.ci_name = group->cg_item.ci_namebuf; 668 group->cg_item.ci_name = group->cg_item.ci_namebuf;
670 name.name = group->cg_item.ci_name;
671 name.len = strlen(name.name);
672 name.hash = full_name_hash(name.name, name.len);
673 669
674 ret = -ENOMEM; 670 ret = -ENOMEM;
675 child = d_alloc(parent, &name); 671 child = d_alloc_name(parent, group->cg_item.ci_name);
676 if (child) { 672 if (child) {
677 d_add(child, NULL); 673 d_add(child, NULL);
678 674
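
[Note: d_alloc_name(), used above, is shorthand for the removed qstr setup; a sketch
of the equivalence, not the library's actual body.]

    struct dentry *d_alloc_name_sketch(struct dentry *parent, const char *name)
    {
        struct qstr q = QSTR_INIT(name, strlen(name));

        q.hash = full_name_hash(q.name, q.len);
        return d_alloc(parent, &q);
    }
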
@@ -1532,84 +1528,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
1532 return (sd->s_mode >> 12) & 15; 1528 return (sd->s_mode >> 12) & 15;
1533} 1529}
1534 1530
1535static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 1531static int configfs_readdir(struct file *file, struct dir_context *ctx)
1536{ 1532{
1537 struct dentry *dentry = filp->f_path.dentry; 1533 struct dentry *dentry = file->f_path.dentry;
1538 struct super_block *sb = dentry->d_sb; 1534 struct super_block *sb = dentry->d_sb;
1539 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1535 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1540 struct configfs_dirent *cursor = filp->private_data; 1536 struct configfs_dirent *cursor = file->private_data;
1541 struct list_head *p, *q = &cursor->s_sibling; 1537 struct list_head *p, *q = &cursor->s_sibling;
1542 ino_t ino = 0; 1538 ino_t ino = 0;
1543 int i = filp->f_pos;
1544 1539
1545 switch (i) { 1540 if (!dir_emit_dots(file, ctx))
1546 case 0: 1541 return 0;
1547 ino = dentry->d_inode->i_ino; 1542 if (ctx->pos == 2) {
1548 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 1543 spin_lock(&configfs_dirent_lock);
1549 break; 1544 list_move(q, &parent_sd->s_children);
1550 filp->f_pos++; 1545 spin_unlock(&configfs_dirent_lock);
1551 i++; 1546 }
1552 /* fallthrough */ 1547 for (p = q->next; p != &parent_sd->s_children; p = p->next) {
1553 case 1: 1548 struct configfs_dirent *next;
1554 ino = parent_ino(dentry); 1549 const char *name;
1555 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 1550 int len;
1556 break; 1551 struct inode *inode = NULL;
1557 filp->f_pos++; 1552
1558 i++; 1553 next = list_entry(p, struct configfs_dirent, s_sibling);
1559 /* fallthrough */ 1554 if (!next->s_element)
1560 default: 1555 continue;
1561 if (filp->f_pos == 2) {
1562 spin_lock(&configfs_dirent_lock);
1563 list_move(q, &parent_sd->s_children);
1564 spin_unlock(&configfs_dirent_lock);
1565 }
1566 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1567 struct configfs_dirent *next;
1568 const char * name;
1569 int len;
1570 struct inode *inode = NULL;
1571 1556
1572 next = list_entry(p, struct configfs_dirent, 1557 name = configfs_get_name(next);
1573 s_sibling); 1558 len = strlen(name);
1574 if (!next->s_element)
1575 continue;
1576
1577 name = configfs_get_name(next);
1578 len = strlen(name);
1579
1580 /*
1581 * We'll have a dentry and an inode for
1582 * PINNED items and for open attribute
1583 * files. We lock here to prevent a race
1584 * with configfs_d_iput() clearing
1585 * s_dentry before calling iput().
1586 *
1587 * Why do we go to the trouble? If
1588 * someone has an attribute file open,
1589 * the inode number should match until
1590 * they close it. Beyond that, we don't
1591 * care.
1592 */
1593 spin_lock(&configfs_dirent_lock);
1594 dentry = next->s_dentry;
1595 if (dentry)
1596 inode = dentry->d_inode;
1597 if (inode)
1598 ino = inode->i_ino;
1599 spin_unlock(&configfs_dirent_lock);
1600 if (!inode)
1601 ino = iunique(sb, 2);
1602 1559
1603 if (filldir(dirent, name, len, filp->f_pos, ino, 1560 /*
1604 dt_type(next)) < 0) 1561 * We'll have a dentry and an inode for
1605 return 0; 1562 * PINNED items and for open attribute
1563 * files. We lock here to prevent a race
1564 * with configfs_d_iput() clearing
1565 * s_dentry before calling iput().
1566 *
1567 * Why do we go to the trouble? If
1568 * someone has an attribute file open,
1569 * the inode number should match until
1570 * they close it. Beyond that, we don't
1571 * care.
1572 */
1573 spin_lock(&configfs_dirent_lock);
1574 dentry = next->s_dentry;
1575 if (dentry)
1576 inode = dentry->d_inode;
1577 if (inode)
1578 ino = inode->i_ino;
1579 spin_unlock(&configfs_dirent_lock);
1580 if (!inode)
1581 ino = iunique(sb, 2);
1606 1582
1607 spin_lock(&configfs_dirent_lock); 1583 if (!dir_emit(ctx, name, len, ino, dt_type(next)))
1608 list_move(q, p); 1584 return 0;
1609 spin_unlock(&configfs_dirent_lock); 1585
1610 p = q; 1586 spin_lock(&configfs_dirent_lock);
1611 filp->f_pos++; 1587 list_move(q, p);
1612 } 1588 spin_unlock(&configfs_dirent_lock);
1589 p = q;
1590 ctx->pos++;
1613 } 1591 }
1614 return 0; 1592 return 0;
1615} 1593}
@@ -1661,14 +1639,13 @@ const struct file_operations configfs_dir_operations = {
1661 .release = configfs_dir_close, 1639 .release = configfs_dir_close,
1662 .llseek = configfs_dir_lseek, 1640 .llseek = configfs_dir_lseek,
1663 .read = generic_read_dir, 1641 .read = generic_read_dir,
1664 .readdir = configfs_readdir, 1642 .iterate = configfs_readdir,
1665}; 1643};
1666 1644
1667int configfs_register_subsystem(struct configfs_subsystem *subsys) 1645int configfs_register_subsystem(struct configfs_subsystem *subsys)
1668{ 1646{
1669 int err; 1647 int err;
1670 struct config_group *group = &subsys->su_group; 1648 struct config_group *group = &subsys->su_group;
1671 struct qstr name;
1672 struct dentry *dentry; 1649 struct dentry *dentry;
1673 struct dentry *root; 1650 struct dentry *root;
1674 struct configfs_dirent *sd; 1651 struct configfs_dirent *sd;
@@ -1685,12 +1662,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1685 1662
1686 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); 1663 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
1687 1664
1688 name.name = group->cg_item.ci_name;
1689 name.len = strlen(name.name);
1690 name.hash = full_name_hash(name.name, name.len);
1691
1692 err = -ENOMEM; 1665 err = -ENOMEM;
1693 dentry = d_alloc(root, &name); 1666 dentry = d_alloc_name(root, group->cg_item.ci_name);
1694 if (dentry) { 1667 if (dentry) {
1695 d_add(dentry, NULL); 1668 d_add(dentry, NULL);
1696 1669
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
203 mutex_lock(&buffer->mutex); 203 mutex_lock(&buffer->mutex);
204 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
205 if (len > 0) 205 if (len > 0)
206 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, len);
207 if (len > 0) 207 if (len > 0)
208 *ppos += len; 208 *ppos += len;
209 mutex_unlock(&buffer->mutex); 209 mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index dafafbafa731..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
45#include <trace/events/sched.h> 45#include <trace/events/sched.h>
46 46
47int core_uses_pid; 47int core_uses_pid;
48char core_pattern[CORENAME_MAX_SIZE] = "core";
49unsigned int core_pipe_limit; 48unsigned int core_pipe_limit;
49char core_pattern[CORENAME_MAX_SIZE] = "core";
50static int core_name_size = CORENAME_MAX_SIZE;
50 51
51struct core_name { 52struct core_name {
52 char *corename; 53 char *corename;
53 int used, size; 54 int used, size;
54}; 55};
55static atomic_t call_count = ATOMIC_INIT(1);
56 56
57/* The maximal length of core_pattern is also specified in sysctl.c */ 57/* The maximal length of core_pattern is also specified in sysctl.c */
58 58
59static int expand_corename(struct core_name *cn) 59static int expand_corename(struct core_name *cn, int size)
60{ 60{
61 char *old_corename = cn->corename; 61 char *corename = krealloc(cn->corename, size, GFP_KERNEL);
62
63 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
64 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
65 62
66 if (!cn->corename) { 63 if (!corename)
67 kfree(old_corename);
68 return -ENOMEM; 64 return -ENOMEM;
69 }
70 65
66 if (size > core_name_size) /* racy but harmless */
67 core_name_size = size;
68
69 cn->size = ksize(corename);
70 cn->corename = corename;
71 return 0; 71 return 0;
72} 72}
73 73
74static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
75{
76 int free, need;
77
78again:
79 free = cn->size - cn->used;
80 need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
81 if (need < free) {
82 cn->used += need;
83 return 0;
84 }
85
86 if (!expand_corename(cn, cn->size + need - free + 1))
87 goto again;
88
89 return -ENOMEM;
90}
91
74static int cn_printf(struct core_name *cn, const char *fmt, ...) 92static int cn_printf(struct core_name *cn, const char *fmt, ...)
75{ 93{
76 char *cur;
77 int need;
78 int ret;
79 va_list arg; 94 va_list arg;
95 int ret;
80 96
81 va_start(arg, fmt); 97 va_start(arg, fmt);
82 need = vsnprintf(NULL, 0, fmt, arg); 98 ret = cn_vprintf(cn, fmt, arg);
83 va_end(arg); 99 va_end(arg);
84 100
85 if (likely(need < cn->size - cn->used - 1)) 101 return ret;
86 goto out_printf; 102}
87 103
88 ret = expand_corename(cn); 104static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
89 if (ret) 105{
90 goto expand_fail; 106 int cur = cn->used;
107 va_list arg;
108 int ret;
91 109
92out_printf:
93 cur = cn->corename + cn->used;
94 va_start(arg, fmt); 110 va_start(arg, fmt);
95 vsnprintf(cur, need + 1, fmt, arg); 111 ret = cn_vprintf(cn, fmt, arg);
96 va_end(arg); 112 va_end(arg);
97 cn->used += need;
98 return 0;
99 113
100expand_fail: 114 for (; cur < cn->used; ++cur) {
115 if (cn->corename[cur] == '/')
116 cn->corename[cur] = '!';
117 }
101 return ret; 118 return ret;
102} 119}
103 120
104static void cn_escape(char *str)
105{
106 for (; *str; str++)
107 if (*str == '/')
108 *str = '!';
109}
110
111static int cn_print_exe_file(struct core_name *cn) 121static int cn_print_exe_file(struct core_name *cn)
112{ 122{
113 struct file *exe_file; 123 struct file *exe_file;
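
[Note: cn_vprintf() above is the classic measure-grow-retry vsnprintf pattern: try
with the space remaining, and if the return value says the output would not fit,
expand the buffer and format again. A hedged userspace equivalent; names are
illustrative.]

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int buf_printf(char **bufp, size_t *sizep, size_t *usedp,
                          const char *fmt, ...)
    {
        size_t free_space = *sizep - *usedp;
        va_list ap;
        int need;

        va_start(ap, fmt);
        need = vsnprintf(*bufp + *usedp, free_space, fmt, ap);
        va_end(ap);
        if (need < 0)
            return -1;
        if ((size_t)need >= free_space) {   /* truncated: grow and retry */
            char *tmp = realloc(*bufp, *usedp + need + 1);
            if (!tmp)
                return -1;
            *bufp = tmp;
            *sizep = *usedp + need + 1;
            va_start(ap, fmt);
            vsnprintf(*bufp + *usedp, need + 1, fmt, ap);
            va_end(ap);
        }
        *usedp += need;
        return 0;
    }
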
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
115 int ret; 125 int ret;
116 126
117 exe_file = get_mm_exe_file(current->mm); 127 exe_file = get_mm_exe_file(current->mm);
118 if (!exe_file) { 128 if (!exe_file)
119 char *commstart = cn->corename + cn->used; 129 return cn_esc_printf(cn, "%s (path unknown)", current->comm);
120 ret = cn_printf(cn, "%s (path unknown)", current->comm);
121 cn_escape(commstart);
122 return ret;
123 }
124 130
125 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); 131 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
126 if (!pathbuf) { 132 if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
134 goto free_buf; 140 goto free_buf;
135 } 141 }
136 142
137 cn_escape(path); 143 ret = cn_esc_printf(cn, "%s", path);
138
139 ret = cn_printf(cn, "%s", path);
140 144
141free_buf: 145free_buf:
142 kfree(pathbuf); 146 kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
157 int pid_in_pattern = 0; 161 int pid_in_pattern = 0;
158 int err = 0; 162 int err = 0;
159 163
160 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
161 cn->corename = kmalloc(cn->size, GFP_KERNEL);
162 cn->used = 0; 164 cn->used = 0;
163 165 cn->corename = NULL;
164 if (!cn->corename) 166 if (expand_corename(cn, core_name_size))
165 return -ENOMEM; 167 return -ENOMEM;
168 cn->corename[0] = '\0';
169
170 if (ispipe)
171 ++pat_ptr;
166 172
167 /* Repeat as long as we have more pattern to process and more output 173 /* Repeat as long as we have more pattern to process and more output
168 space */ 174 space */
169 while (*pat_ptr) { 175 while (*pat_ptr) {
170 if (*pat_ptr != '%') { 176 if (*pat_ptr != '%') {
171 if (*pat_ptr == 0)
172 goto out;
173 err = cn_printf(cn, "%c", *pat_ptr++); 177 err = cn_printf(cn, "%c", *pat_ptr++);
174 } else { 178 } else {
175 switch (*++pat_ptr) { 179 switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
210 break; 214 break;
211 } 215 }
212 /* hostname */ 216 /* hostname */
213 case 'h': { 217 case 'h':
214 char *namestart = cn->corename + cn->used;
215 down_read(&uts_sem); 218 down_read(&uts_sem);
216 err = cn_printf(cn, "%s", 219 err = cn_esc_printf(cn, "%s",
217 utsname()->nodename); 220 utsname()->nodename);
218 up_read(&uts_sem); 221 up_read(&uts_sem);
219 cn_escape(namestart);
220 break; 222 break;
221 }
222 /* executable */ 223 /* executable */
223 case 'e': { 224 case 'e':
224 char *commstart = cn->corename + cn->used; 225 err = cn_esc_printf(cn, "%s", current->comm);
225 err = cn_printf(cn, "%s", current->comm);
226 cn_escape(commstart);
227 break; 226 break;
228 }
229 case 'E': 227 case 'E':
230 err = cn_print_exe_file(cn); 228 err = cn_print_exe_file(cn);
231 break; 229 break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
244 return err; 242 return err;
245 } 243 }
246 244
245out:
247 /* Backward compatibility with core_uses_pid: 246 /* Backward compatibility with core_uses_pid:
248 * 247 *
249 * If core_pattern does not include a %p (as is the default) 248 * If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
254 if (err) 253 if (err)
255 return err; 254 return err;
256 } 255 }
257out:
258 return ispipe; 256 return ispipe;
259} 257}
260 258
@@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo)
549 if (ispipe < 0) { 547 if (ispipe < 0) {
550 printk(KERN_WARNING "format_corename failed\n"); 548 printk(KERN_WARNING "format_corename failed\n");
551 printk(KERN_WARNING "Aborting core\n"); 549 printk(KERN_WARNING "Aborting core\n");
552 goto fail_corename; 550 goto fail_unlock;
553 } 551 }
554 552
555 if (cprm.limit == 1) { 553 if (cprm.limit == 1) {
@@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
584 goto fail_dropcount; 582 goto fail_dropcount;
585 } 583 }
586 584
587 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); 585 helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
588 if (!helper_argv) { 586 if (!helper_argv) {
589 printk(KERN_WARNING "%s failed to allocate memory\n", 587 printk(KERN_WARNING "%s failed to allocate memory\n",
590 __func__); 588 __func__);
@@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
601 599
602 argv_free(helper_argv); 600 argv_free(helper_argv);
603 if (retval) { 601 if (retval) {
604 printk(KERN_INFO "Core dump to %s pipe failed\n", 602 printk(KERN_INFO "Core dump to |%s pipe failed\n",
605 cn.corename); 603 cn.corename);
606 goto close_fail; 604 goto close_fail;
607 } 605 }
@@ -669,7 +667,6 @@ fail_dropcount:
669 atomic_dec(&core_dump_count); 667 atomic_dec(&core_dump_count);
670fail_unlock: 668fail_unlock:
671 kfree(cn.corename); 669 kfree(cn.corename);
672fail_corename:
673 coredump_finish(mm, core_dumped); 670 coredump_finish(mm, core_dumped);
674 revert_creds(old_cred); 671 revert_creds(old_cred);
675fail_creds: 672fail_creds:
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7bd18b7..e501ac3a49ff 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
349/* 349/*
350 * Read a cramfs directory entry. 350 * Read a cramfs directory entry.
351 */ 351 */
352static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 352static int cramfs_readdir(struct file *file, struct dir_context *ctx)
353{ 353{
354 struct inode *inode = file_inode(filp); 354 struct inode *inode = file_inode(file);
355 struct super_block *sb = inode->i_sb; 355 struct super_block *sb = inode->i_sb;
356 char *buf; 356 char *buf;
357 unsigned int offset; 357 unsigned int offset;
358 int copied;
359 358
360 /* Offset within the thing. */ 359 /* Offset within the thing. */
361 offset = filp->f_pos; 360 if (ctx->pos >= inode->i_size)
362 if (offset >= inode->i_size)
363 return 0; 361 return 0;
362 offset = ctx->pos;
364 /* Directory entries are always 4-byte aligned */ 363 /* Directory entries are always 4-byte aligned */
365 if (offset & 3) 364 if (offset & 3)
366 return -EINVAL; 365 return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
369 if (!buf) 368 if (!buf)
370 return -ENOMEM; 369 return -ENOMEM;
371 370
372 copied = 0;
373 while (offset < inode->i_size) { 371 while (offset < inode->i_size) {
374 struct cramfs_inode *de; 372 struct cramfs_inode *de;
375 unsigned long nextoffset; 373 unsigned long nextoffset;
376 char *name; 374 char *name;
377 ino_t ino; 375 ino_t ino;
378 umode_t mode; 376 umode_t mode;
379 int namelen, error; 377 int namelen;
380 378
381 mutex_lock(&read_mutex); 379 mutex_lock(&read_mutex);
382 de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 380 de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
402 break; 400 break;
403 namelen--; 401 namelen--;
404 } 402 }
405 error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); 403 if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
406 if (error)
407 break; 404 break;
408 405
409 offset = nextoffset; 406 ctx->pos = offset = nextoffset;
410 filp->f_pos = offset;
411 copied++;
412 } 407 }
413 kfree(buf); 408 kfree(buf);
414 return 0; 409 return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
547static const struct file_operations cramfs_directory_operations = { 542static const struct file_operations cramfs_directory_operations = {
548 .llseek = generic_file_llseek, 543 .llseek = generic_file_llseek,
549 .read = generic_read_dir, 544 .read = generic_read_dir,
550 .readdir = cramfs_readdir, 545 .iterate = cramfs_readdir,
551}; 546};
552 547
553static const struct inode_operations cramfs_dir_inode_operations = { 548static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b9085f7d8..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
1612 * If a dentry was found and moved, then it is returned. Otherwise NULL 1612 * If a dentry was found and moved, then it is returned. Otherwise NULL
1613 * is returned. This matches the expected return value of ->lookup. 1613 * is returned. This matches the expected return value of ->lookup.
1614 * 1614 *
1615 * Cluster filesystems may call this function with a negative, hashed dentry.
1616 * In that case, we know that the inode will be a regular file, and also this
1617 * will only occur during atomic_open. So we need to check for the dentry
1618 * being already hashed only in the final case.
1615 */ 1619 */
1616struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) 1620struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1617{ 1621{
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1636 security_d_instantiate(dentry, inode); 1640 security_d_instantiate(dentry, inode);
1637 d_rehash(dentry); 1641 d_rehash(dentry);
1638 } 1642 }
1639 } else 1643 } else {
1640 d_add(dentry, inode); 1644 d_instantiate(dentry, inode);
1645 if (d_unhashed(dentry))
1646 d_rehash(dentry);
1647 }
1641 return new; 1648 return new;
1642} 1649}
1643EXPORT_SYMBOL(d_splice_alias); 1650EXPORT_SYMBOL(d_splice_alias);
@@ -1723,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
1723 * Do the slow-case of the dentry name compare. 1730 * Do the slow-case of the dentry name compare.
1724 * 1731 *
1725 * Unlike the dentry_cmp() function, we need to atomically 1732 * Unlike the dentry_cmp() function, we need to atomically
1726 * load the name, length and inode information, so that the 1733 * load the name and length information, so that the
1727 * filesystem can rely on them, and can use the 'name' and 1734 * filesystem can rely on them, and can use the 'name' and
1728 * 'len' information without worrying about walking off the 1735 * 'len' information without worrying about walking off the
1729 * end of memory etc. 1736 * end of memory etc.
@@ -1741,22 +1748,18 @@ enum slow_d_compare {
1741 1748
1742static noinline enum slow_d_compare slow_dentry_cmp( 1749static noinline enum slow_d_compare slow_dentry_cmp(
1743 const struct dentry *parent, 1750 const struct dentry *parent,
1744 struct inode *inode,
1745 struct dentry *dentry, 1751 struct dentry *dentry,
1746 unsigned int seq, 1752 unsigned int seq,
1747 const struct qstr *name) 1753 const struct qstr *name)
1748{ 1754{
1749 int tlen = dentry->d_name.len; 1755 int tlen = dentry->d_name.len;
1750 const char *tname = dentry->d_name.name; 1756 const char *tname = dentry->d_name.name;
1751 struct inode *i = dentry->d_inode;
1752 1757
1753 if (read_seqcount_retry(&dentry->d_seq, seq)) { 1758 if (read_seqcount_retry(&dentry->d_seq, seq)) {
1754 cpu_relax(); 1759 cpu_relax();
1755 return D_COMP_SEQRETRY; 1760 return D_COMP_SEQRETRY;
1756 } 1761 }
1757 if (parent->d_op->d_compare(parent, inode, 1762 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1758 dentry, i,
1759 tlen, tname, name))
1760 return D_COMP_NOMATCH; 1763 return D_COMP_NOMATCH;
1761 return D_COMP_OK; 1764 return D_COMP_OK;
1762} 1765}
@@ -1766,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1766 * @parent: parent dentry 1769 * @parent: parent dentry
1767 * @name: qstr of name we wish to find 1770 * @name: qstr of name we wish to find
1768 * @seqp: returns d_seq value at the point where the dentry was found 1771 * @seqp: returns d_seq value at the point where the dentry was found
1769 * @inode: returns dentry->d_inode when the inode was found valid.
1770 * Returns: dentry, or NULL 1772 * Returns: dentry, or NULL
1771 * 1773 *
1772 * __d_lookup_rcu is the dcache lookup function for rcu-walk name 1774 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1793,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1793 */ 1795 */
1794struct dentry *__d_lookup_rcu(const struct dentry *parent, 1796struct dentry *__d_lookup_rcu(const struct dentry *parent,
1795 const struct qstr *name, 1797 const struct qstr *name,
1796 unsigned *seqp, struct inode *inode) 1798 unsigned *seqp)
1797{ 1799{
1798 u64 hashlen = name->hash_len; 1800 u64 hashlen = name->hash_len;
1799 const unsigned char *str = name->name; 1801 const unsigned char *str = name->name;
@@ -1827,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
1827seqretry: 1829seqretry:
1828 /* 1830 /*
1829 * The dentry sequence count protects us from concurrent 1831 * The dentry sequence count protects us from concurrent
1830 * renames, and thus protects inode, parent and name fields. 1832 * renames, and thus protects parent and name fields.
1831 * 1833 *
1832 * The caller must perform a seqcount check in order 1834 * The caller must perform a seqcount check in order
1833 * to do anything useful with the returned dentry, 1835 * to do anything useful with the returned dentry.
1834 * including using the 'd_inode' pointer.
1835 * 1836 *
1836 * NOTE! We do a "raw" seqcount_begin here. That means that 1837 * NOTE! We do a "raw" seqcount_begin here. That means that
1837 * we don't wait for the sequence count to stabilize if it 1838 * we don't wait for the sequence count to stabilize if it
@@ -1845,12 +1846,12 @@ seqretry:
1845 continue; 1846 continue;
1846 if (d_unhashed(dentry)) 1847 if (d_unhashed(dentry))
1847 continue; 1848 continue;
1848 *seqp = seq;
1849 1849
1850 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { 1850 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1851 if (dentry->d_name.hash != hashlen_hash(hashlen)) 1851 if (dentry->d_name.hash != hashlen_hash(hashlen))
1852 continue; 1852 continue;
1853 switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { 1853 *seqp = seq;
1854 switch (slow_dentry_cmp(parent, dentry, seq, name)) {
1854 case D_COMP_OK: 1855 case D_COMP_OK:
1855 return dentry; 1856 return dentry;
1856 case D_COMP_NOMATCH: 1857 case D_COMP_NOMATCH:
@@ -1862,6 +1863,7 @@ seqretry:
1862 1863
1863 if (dentry->d_name.hash_len != hashlen) 1864 if (dentry->d_name.hash_len != hashlen)
1864 continue; 1865 continue;
1866 *seqp = seq;
1865 if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) 1867 if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
1866 return dentry; 1868 return dentry;
1867 } 1869 }
@@ -1959,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
1959 if (parent->d_flags & DCACHE_OP_COMPARE) { 1961 if (parent->d_flags & DCACHE_OP_COMPARE) {
1960 int tlen = dentry->d_name.len; 1962 int tlen = dentry->d_name.len;
1961 const char *tname = dentry->d_name.name; 1963 const char *tname = dentry->d_name.name;
1962 if (parent->d_op->d_compare(parent, parent->d_inode, 1964 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1963 dentry, dentry->d_inode,
1964 tlen, tname, name))
1965 goto next; 1965 goto next;
1966 } else { 1966 } else {
1967 if (dentry->d_name.len != len) 1967 if (dentry->d_name.len != len)
@@ -1998,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1998 */ 1998 */
1999 name->hash = full_name_hash(name->name, name->len); 1999 name->hash = full_name_hash(name->name, name->len);
2000 if (dir->d_flags & DCACHE_OP_HASH) { 2000 if (dir->d_flags & DCACHE_OP_HASH) {
2001 int err = dir->d_op->d_hash(dir, dir->d_inode, name); 2001 int err = dir->d_op->d_hash(dir, name);
2002 if (unlikely(err < 0)) 2002 if (unlikely(err < 0))
2003 return ERR_PTR(err); 2003 return ERR_PTR(err);
2004 } 2004 }
@@ -2968,34 +2968,21 @@ rename_retry:
2968 goto again; 2968 goto again;
2969} 2969}
2970 2970
2971/** 2971void d_tmpfile(struct dentry *dentry, struct inode *inode)
2972 * find_inode_number - check for dentry with name
2973 * @dir: directory to check
2974 * @name: Name to find.
2975 *
2976 * Check whether a dentry already exists for the given name,
2977 * and return the inode number if it has an inode. Otherwise
2978 * 0 is returned.
2979 *
2980 * This routine is used to post-process directory listings for
2981 * filesystems using synthetic inode numbers, and is necessary
2982 * to keep getcwd() working.
2983 */
2984
2985ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2986{ 2972{
2987 struct dentry * dentry; 2973 inode_dec_link_count(inode);
2988 ino_t ino = 0; 2974 BUG_ON(dentry->d_name.name != dentry->d_iname ||
2989 2975 !hlist_unhashed(&dentry->d_alias) ||
2990 dentry = d_hash_and_lookup(dir, name); 2976 !d_unlinked(dentry));
2991 if (!IS_ERR_OR_NULL(dentry)) { 2977 spin_lock(&dentry->d_parent->d_lock);
2992 if (dentry->d_inode) 2978 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2993 ino = dentry->d_inode->i_ino; 2979 dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
2994 dput(dentry); 2980 (unsigned long long)inode->i_ino);
2995 } 2981 spin_unlock(&dentry->d_lock);
2996 return ino; 2982 spin_unlock(&dentry->d_parent->d_lock);
2983 d_instantiate(dentry, inode);
2997} 2984}
2998EXPORT_SYMBOL(find_inode_number); 2985EXPORT_SYMBOL(d_tmpfile);
2999 2986
3000static __initdata unsigned long dhash_entries; 2987static __initdata unsigned long dhash_entries;
3001static int __init set_dhash_entries(char *str) 2988static int __init set_dhash_entries(char *str)
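
[Note: d_tmpfile(), added above, gives an O_TMPFILE inode a synthetic "#<ino>"
dentry name, drops the link count and instantiates the dentry. A hypothetical
->tmpfile implementation showing where it fits; myfs_new_inode() is assumed.]

    static int myfs_tmpfile(struct inode *dir, struct dentry *dentry,
                            umode_t mode)
    {
        struct inode *inode = myfs_new_inode(dir, mode); /* assumed helper */

        if (IS_ERR(inode))
            return PTR_ERR(inode);
        /* d_tmpfile() decrements i_nlink and names the dentry "#<ino>" */
        d_tmpfile(dentry, inode);
        return 0;
    }
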
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
21#include <linux/debugfs.h> 21#include <linux/debugfs.h>
22#include <linux/io.h> 22#include <linux/io.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/atomic.h>
24 25
25static ssize_t default_read_file(struct file *file, char __user *buf, 26static ssize_t default_read_file(struct file *file, char __user *buf,
26 size_t count, loff_t *ppos) 27 size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
403} 404}
404EXPORT_SYMBOL_GPL(debugfs_create_size_t); 405EXPORT_SYMBOL_GPL(debugfs_create_size_t);
405 406
407static int debugfs_atomic_t_set(void *data, u64 val)
408{
409 atomic_set((atomic_t *)data, val);
410 return 0;
411}
412static int debugfs_atomic_t_get(void *data, u64 *val)
413{
414 *val = atomic_read((atomic_t *)data);
415 return 0;
416}
417DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
418 debugfs_atomic_t_set, "%lld\n");
419DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
420DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
421
422/**
423 * debugfs_create_atomic_t - create a debugfs file that is used to read and
424 * write an atomic_t value
425 * @name: a pointer to a string containing the name of the file to create.
426 * @mode: the permission that the file should have
427 * @parent: a pointer to the parent dentry for this file. This should be a
428 * directory dentry if set. If this parameter is %NULL, then the
429 * file will be created in the root of the debugfs filesystem.
 430 * @value: a pointer to the variable that the file should read from and
 431 * write to.
432 */
433struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
434 struct dentry *parent, atomic_t *value)
435{
436 /* if there are no write bits set, make read only */
437 if (!(mode & S_IWUGO))
438 return debugfs_create_file(name, mode, parent, value,
439 &fops_atomic_t_ro);
440 /* if there are no read bits set, make write only */
441 if (!(mode & S_IRUGO))
442 return debugfs_create_file(name, mode, parent, value,
443 &fops_atomic_t_wo);
444
445 return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
446}
447EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
406 448
407static ssize_t read_file_bool(struct file *file, char __user *user_buf, 449static ssize_t read_file_bool(struct file *file, char __user *user_buf,
408 size_t count, loff_t *ppos) 450 size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
431 if (copy_from_user(buf, user_buf, buf_size)) 473 if (copy_from_user(buf, user_buf, buf_size))
432 return -EFAULT; 474 return -EFAULT;
433 475
476 buf[buf_size] = '\0';
434 if (strtobool(buf, &bv) == 0) 477 if (strtobool(buf, &bv) == 0)
435 *val = bv; 478 *val = bv;
436 479
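
fs/debugfs/file.c picks up two fixes: an atomic_t file type whose read-write, read-only, or write-only variant is chosen from the permission bits, and a one-byte NUL terminator in write_file_bool() so strtobool() never reads past the stack buffer. A hedged usage sketch for the new helper; the "retries" counter and init function are invented for illustration:

    #include <linux/atomic.h>
    #include <linux/debugfs.h>

    static atomic_t myfs_retries = ATOMIC_INIT(0);  /* invented counter */

    static int __init myfs_debugfs_init(void)
    {
            /* 0644 has both read and write bits, so debugfs picks the
             * read/write fops_atomic_t; 0444 would get the _ro variant,
             * 0200 the _wo one. */
            if (!debugfs_create_atomic_t("retries", 0644, NULL,
                                         &myfs_retries))
                    return -ENOMEM;
            return 0;
    }
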
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3fdef7..c7c83ff0f752 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
533 */ 533 */
534void debugfs_remove_recursive(struct dentry *dentry) 534void debugfs_remove_recursive(struct dentry *dentry)
535{ 535{
536 struct dentry *child; 536 struct dentry *child, *next, *parent;
537 struct dentry *parent;
538 537
539 if (IS_ERR_OR_NULL(dentry)) 538 if (IS_ERR_OR_NULL(dentry))
540 return; 539 return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
544 return; 543 return;
545 544
546 parent = dentry; 545 parent = dentry;
546 down:
547 mutex_lock(&parent->d_inode->i_mutex); 547 mutex_lock(&parent->d_inode->i_mutex);
548 list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
549 if (!debugfs_positive(child))
550 continue;
548 551
549 while (1) { 552 /* perhaps simple_empty(child) makes more sense */
550 /*
551 * When all dentries under "parent" has been removed,
552 * walk up the tree until we reach our starting point.
553 */
554 if (list_empty(&parent->d_subdirs)) {
555 mutex_unlock(&parent->d_inode->i_mutex);
556 if (parent == dentry)
557 break;
558 parent = parent->d_parent;
559 mutex_lock(&parent->d_inode->i_mutex);
560 }
561 child = list_entry(parent->d_subdirs.next, struct dentry,
562 d_u.d_child);
563 next_sibling:
564
565 /*
566 * If "child" isn't empty, walk down the tree and
567 * remove all its descendants first.
568 */
569 if (!list_empty(&child->d_subdirs)) { 553 if (!list_empty(&child->d_subdirs)) {
570 mutex_unlock(&parent->d_inode->i_mutex); 554 mutex_unlock(&parent->d_inode->i_mutex);
571 parent = child; 555 parent = child;
572 mutex_lock(&parent->d_inode->i_mutex); 556 goto down;
573 continue;
574 } 557 }
575 __debugfs_remove(child, parent); 558 up:
576 if (parent->d_subdirs.next == &child->d_u.d_child) { 559 if (!__debugfs_remove(child, parent))
577 /* 560 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
578 * Try the next sibling.
579 */
580 if (child->d_u.d_child.next != &parent->d_subdirs) {
581 child = list_entry(child->d_u.d_child.next,
582 struct dentry,
583 d_u.d_child);
584 goto next_sibling;
585 }
586
587 /*
588 * Avoid infinite loop if we fail to remove
589 * one dentry.
590 */
591 mutex_unlock(&parent->d_inode->i_mutex);
592 break;
593 }
594 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
595 } 561 }
596 562
597 parent = dentry->d_parent; 563 mutex_unlock(&parent->d_inode->i_mutex);
564 child = parent;
565 parent = parent->d_parent;
598 mutex_lock(&parent->d_inode->i_mutex); 566 mutex_lock(&parent->d_inode->i_mutex);
599 __debugfs_remove(dentry, parent); 567
568 if (child != dentry) {
569 next = list_entry(child->d_u.d_child.next, struct dentry,
570 d_u.d_child);
571 goto up;
572 }
573
574 if (!__debugfs_remove(child, parent))
575 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
600 mutex_unlock(&parent->d_inode->i_mutex); 576 mutex_unlock(&parent->d_inode->i_mutex);
601 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
602} 577}
603EXPORT_SYMBOL_GPL(debugfs_remove_recursive); 578EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
604 579
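
debugfs_remove_recursive() trades the old hand-rolled sibling bookkeeping for a plain iterative depth-first walk: descend into the first non-empty child (the down: label), delete leaves, then climb back to the parent and resume after the deleted subtree (up:), dropping one filesystem reference per dentry actually removed. The control flow is easier to see without the dentry locking; a userspace restatement over an assumed child-pointer tree, not kernel code:

    #include <stdlib.h>

    struct node {                           /* assumed tree node, not a dentry */
            struct node *parent;
            struct node *first_child;       /* NULL for a leaf */
            struct node *next_sibling;
    };

    static void remove_recursive(struct node *top)
    {
            struct node *n = top;

            while (n) {
                    if (n->first_child) {   /* "down": find a leaf first */
                            n = n->first_child;
                            continue;
                    }
                    /* n is a leaf: unlink it, then resume at its
                     * successor -- next sibling, else the parent ("up") */
                    struct node *parent = n->parent;
                    struct node *next = n->next_sibling;
                    int done = (n == top);

                    if (parent)
                            parent->first_child = next;
                    free(n);
                    if (done)
                            break;
                    n = next ? next : parent;
            }
    }
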
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
138static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, 138static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
139 const char *buf, size_t len) 139 const char *buf, size_t len)
140{ 140{
141 strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); 141 strlcpy(dlm_config.ci_cluster_name, buf,
142 strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); 142 sizeof(dlm_config.ci_cluster_name));
143 strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
143 return len; 144 return len;
144} 145}
145 146
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; 2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
2039 if (b == 1) { 2039 if (b == 1) {
2040 int len = receive_extralen(ms); 2040 int len = receive_extralen(ms);
2041 if (len > DLM_RESNAME_MAXLEN) 2041 if (len > r->res_ls->ls_lvblen)
2042 len = DLM_RESNAME_MAXLEN; 2042 len = r->res_ls->ls_lvblen;
2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2044 lkb->lkb_lvbseq = ms->m_lvbseq; 2044 lkb->lkb_lvbseq = ms->m_lvbseq;
2045 } 2045 }
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3893 if (!lkb->lkb_lvbptr) 3893 if (!lkb->lkb_lvbptr)
3894 return -ENOMEM; 3894 return -ENOMEM;
3895 len = receive_extralen(ms); 3895 len = receive_extralen(ms);
3896 if (len > DLM_RESNAME_MAXLEN) 3896 if (len > ls->ls_lvblen)
3897 len = DLM_RESNAME_MAXLEN; 3897 len = ls->ls_lvblen;
3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3899 } 3899 }
3900 return 0; 3900 return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
883void dlm_stop_lockspaces(void) 883void dlm_stop_lockspaces(void)
884{ 884{
885 struct dlm_ls *ls; 885 struct dlm_ls *ls;
886 int count;
886 887
887 restart: 888 restart:
889 count = 0;
888 spin_lock(&lslist_lock); 890 spin_lock(&lslist_lock);
889 list_for_each_entry(ls, &lslist, ls_list) { 891 list_for_each_entry(ls, &lslist, ls_list) {
890 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) 892 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
893 count++;
891 continue; 894 continue;
895 }
892 spin_unlock(&lslist_lock); 896 spin_unlock(&lslist_lock);
893 log_error(ls, "no userland control daemon, stopping lockspace"); 897 log_error(ls, "no userland control daemon, stopping lockspace");
894 dlm_ls_stop(ls); 898 dlm_ls_stop(ls);
895 goto restart; 899 goto restart;
896 } 900 }
897 spin_unlock(&lslist_lock); 901 spin_unlock(&lslist_lock);
902
903 if (count)
904 log_print("dlm user daemon left %d lockspaces", count);
898} 905}
899 906
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <linux/sctp.h>
56#include <net/sctp/sctp.h> 55#include <net/sctp/sctp.h>
57#include <net/ipv6.h> 56#include <net/ipv6.h>
58 57
@@ -126,6 +125,7 @@ struct connection {
126 struct connection *othercon; 125 struct connection *othercon;
127 struct work_struct rwork; /* Receive workqueue */ 126 struct work_struct rwork; /* Receive workqueue */
128 struct work_struct swork; /* Send workqueue */ 127 struct work_struct swork; /* Send workqueue */
128 bool try_new_addr;
129}; 129};
130#define sock2con(x) ((struct connection *)(x)->sk_user_data) 130#define sock2con(x) ((struct connection *)(x)->sk_user_data)
131 131
@@ -144,6 +144,7 @@ struct dlm_node_addr {
144 struct list_head list; 144 struct list_head list;
145 int nodeid; 145 int nodeid;
146 int addr_count; 146 int addr_count;
147 int curr_addr_index;
147 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; 148 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
148}; 149};
149 150
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
310} 311}
311 312
312static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, 313static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
313 struct sockaddr *sa_out) 314 struct sockaddr *sa_out, bool try_new_addr)
314{ 315{
315 struct sockaddr_storage sas; 316 struct sockaddr_storage sas;
316 struct dlm_node_addr *na; 317 struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
320 321
321 spin_lock(&dlm_node_addrs_spin); 322 spin_lock(&dlm_node_addrs_spin);
322 na = find_node_addr(nodeid); 323 na = find_node_addr(nodeid);
323 if (na && na->addr_count) 324 if (na && na->addr_count) {
324 memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); 325 if (try_new_addr) {
326 na->curr_addr_index++;
327 if (na->curr_addr_index == na->addr_count)
328 na->curr_addr_index = 0;
329 }
330
 331 memcpy(&sas, na->addr[na->curr_addr_index],
332 sizeof(struct sockaddr_storage));
333 }
325 spin_unlock(&dlm_node_addrs_spin); 334 spin_unlock(&dlm_node_addrs_spin);
326 335
327 if (!na) 336 if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
353{ 362{
354 struct dlm_node_addr *na; 363 struct dlm_node_addr *na;
355 int rv = -EEXIST; 364 int rv = -EEXIST;
365 int addr_i;
356 366
357 spin_lock(&dlm_node_addrs_spin); 367 spin_lock(&dlm_node_addrs_spin);
358 list_for_each_entry(na, &dlm_node_addrs, list) { 368 list_for_each_entry(na, &dlm_node_addrs, list) {
359 if (!na->addr_count) 369 if (!na->addr_count)
360 continue; 370 continue;
361 371
362 if (!addr_compare(na->addr[0], addr)) 372 for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
363 continue; 373 if (addr_compare(na->addr[addr_i], addr)) {
364 374 *nodeid = na->nodeid;
365 *nodeid = na->nodeid; 375 rv = 0;
366 rv = 0; 376 goto unlock;
367 break; 377 }
378 }
368 } 379 }
380unlock:
369 spin_unlock(&dlm_node_addrs_spin); 381 spin_unlock(&dlm_node_addrs_spin);
370 return rv; 382 return rv;
371} 383}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
561 573
562static void sctp_init_failed_foreach(struct connection *con) 574static void sctp_init_failed_foreach(struct connection *con)
563{ 575{
576
577 /*
 577	 * Don't try to recover the base con, and handle the race where the
 578	 * other node's assoc init creates an assoc and we get that
 579	 * notification, then we get a notification that our own attempt
 580	 * failed. This happens when we are still trying the primary
582 * address, but the other node has already tried secondary addrs
583 * and found one that worked.
584 */
585 if (!con->nodeid || con->sctp_assoc)
586 return;
587
588 log_print("Retrying SCTP association init for node %d\n", con->nodeid);
589
590 con->try_new_addr = true;
564 con->sctp_assoc = 0; 591 con->sctp_assoc = 0;
565 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 592 if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
566 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 593 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
567 queue_work(send_workqueue, &con->swork); 594 queue_work(send_workqueue, &con->swork);
568 } 595 }
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
579 mutex_unlock(&connections_lock); 606 mutex_unlock(&connections_lock);
580} 607}
581 608
609static void retry_failed_sctp_send(struct connection *recv_con,
610 struct sctp_send_failed *sn_send_failed,
611 char *buf)
612{
613 int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
614 struct dlm_mhandle *mh;
615 struct connection *con;
616 char *retry_buf;
617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
618
619 log_print("Retry sending %d bytes to node id %d", len, nodeid);
620
621 con = nodeid2con(nodeid, 0);
622 if (!con) {
623 log_print("Could not look up con for nodeid %d\n",
624 nodeid);
625 return;
626 }
627
628 mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
629 if (!mh) {
630 log_print("Could not allocate buf for retry.");
631 return;
632 }
633 memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
634 dlm_lowcomms_commit_buffer(mh);
635
636 /*
 637	 * If we got an assoc changed event before the send failed event, then
 638	 * we only need to retry the send.
639 */
640 if (con->sctp_assoc) {
641 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
642 queue_work(send_workqueue, &con->swork);
643 } else
644 sctp_init_failed_foreach(con);
645}
646
582/* Something happened to an association */ 647/* Something happened to an association */
583static void process_sctp_notification(struct connection *con, 648static void process_sctp_notification(struct connection *con,
584 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
585{ 650{
586 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
587 652
588 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { 653 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED:
655 retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
656 break;
657 case SCTP_ASSOC_CHANGE:
589 switch (sn->sn_assoc_change.sac_state) { 658 switch (sn->sn_assoc_change.sac_state) {
590
591 case SCTP_COMM_UP: 659 case SCTP_COMM_UP:
592 case SCTP_RESTART: 660 case SCTP_RESTART:
593 { 661 {
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
662 log_print("connecting to %d sctp association %d", 730 log_print("connecting to %d sctp association %d",
663 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
664 732
733 new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
734 new_con->try_new_addr = false;
665 /* Send any pending writes */ 735 /* Send any pending writes */
666 clear_bit(CF_CONNECT_PENDING, &new_con->flags); 736 clear_bit(CF_CONNECT_PENDING, &new_con->flags);
667 clear_bit(CF_INIT_PENDING, &con->flags); 737 clear_bit(CF_INIT_PENDING, &new_con->flags);
668 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { 738 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
669 queue_work(send_workqueue, &new_con->swork); 739 queue_work(send_workqueue, &new_con->swork);
670 } 740 }
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
683 } 753 }
684 break; 754 break;
685 755
686 /* We don't know which INIT failed, so clear the PENDING flags
687 * on them all. if assoc_id is zero then it will then try
688 * again */
689
690 case SCTP_CANT_STR_ASSOC: 756 case SCTP_CANT_STR_ASSOC:
691 { 757 {
758 /* Will retry init when we get the send failed notification */
692 log_print("Can't start SCTP association - retrying"); 759 log_print("Can't start SCTP association - retrying");
693 sctp_init_failed();
694 } 760 }
695 break; 761 break;
696 762
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
699 (int)sn->sn_assoc_change.sac_assoc_id, 765 (int)sn->sn_assoc_change.sac_assoc_id,
700 sn->sn_assoc_change.sac_state); 766 sn->sn_assoc_change.sac_state);
701 } 767 }
768 default:
769 ; /* fall through */
702 } 770 }
703} 771}
704 772
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
958 kfree(e); 1026 kfree(e);
959} 1027}
960 1028
1029/*
1030 * writequeue_entry_complete - try to delete and free write queue entry
1031 * @e: write queue entry to try to delete
1032 * @completed: bytes completed
1033 *
1034 * writequeue_lock must be held.
1035 */
1036static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
1037{
1038 e->offset += completed;
1039 e->len -= completed;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045}
1046
961/* Initiate an SCTP association. 1047/* Initiate an SCTP association.
962 This is a special case of send_to_sock() in that we don't yet have a 1048 This is a special case of send_to_sock() in that we don't yet have a
963 peeled-off socket for this association, so we use the listening socket 1049 peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
977 int addrlen; 1063 int addrlen;
978 struct kvec iov[1]; 1064 struct kvec iov[1];
979 1065
1066 mutex_lock(&con->sock_mutex);
980 if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) 1067 if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
981 return; 1068 goto unlock;
982
983 if (con->retries++ > MAX_CONNECT_RETRIES)
984 return;
985 1069
986 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { 1070 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
1071 con->try_new_addr)) {
987 log_print("no address for nodeid %d", con->nodeid); 1072 log_print("no address for nodeid %d", con->nodeid);
988 return; 1073 goto unlock;
989 } 1074 }
990 base_con = nodeid2con(0, 0); 1075 base_con = nodeid2con(0, 0);
991 BUG_ON(base_con == NULL); 1076 BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
1003 if (list_empty(&con->writequeue)) { 1088 if (list_empty(&con->writequeue)) {
1004 spin_unlock(&con->writequeue_lock); 1089 spin_unlock(&con->writequeue_lock);
1005 log_print("writequeue empty for nodeid %d", con->nodeid); 1090 log_print("writequeue empty for nodeid %d", con->nodeid);
1006 return; 1091 goto unlock;
1007 } 1092 }
1008 1093
1009 e = list_first_entry(&con->writequeue, struct writequeue_entry, list); 1094 e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
1010 len = e->len; 1095 len = e->len;
1011 offset = e->offset; 1096 offset = e->offset;
1012 spin_unlock(&con->writequeue_lock);
1013 1097
1014 /* Send the first block off the write queue */ 1098 /* Send the first block off the write queue */
1015 iov[0].iov_base = page_address(e->page)+offset; 1099 iov[0].iov_base = page_address(e->page)+offset;
1016 iov[0].iov_len = len; 1100 iov[0].iov_len = len;
1101 spin_unlock(&con->writequeue_lock);
1102
1103 if (rem_addr.ss_family == AF_INET) {
1104 struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
1105 log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
1106 } else {
1107 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
1108 log_print("Trying to connect to %pI6", &sin6->sin6_addr);
1109 }
1017 1110
1018 cmsg = CMSG_FIRSTHDR(&outmessage); 1111 cmsg = CMSG_FIRSTHDR(&outmessage);
1019 cmsg->cmsg_level = IPPROTO_SCTP; 1112 cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
1021 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 1114 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
1022 sinfo = CMSG_DATA(cmsg); 1115 sinfo = CMSG_DATA(cmsg);
1023 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 1116 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
1024 sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); 1117 sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
1025 outmessage.msg_controllen = cmsg->cmsg_len; 1118 outmessage.msg_controllen = cmsg->cmsg_len;
1119 sinfo->sinfo_flags |= SCTP_ADDR_OVER;
1026 1120
1027 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); 1121 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
1028 if (ret < 0) { 1122 if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
1035 } 1129 }
1036 else { 1130 else {
1037 spin_lock(&con->writequeue_lock); 1131 spin_lock(&con->writequeue_lock);
1038 e->offset += ret; 1132 writequeue_entry_complete(e, ret);
1039 e->len -= ret;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045 spin_unlock(&con->writequeue_lock); 1133 spin_unlock(&con->writequeue_lock);
1046 } 1134 }
1135
1136unlock:
1137 mutex_unlock(&con->sock_mutex);
1047} 1138}
1048 1139
1049/* Connect a new socket to its peer */ 1140/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
1075 goto out_err; 1166 goto out_err;
1076 1167
1077 memset(&saddr, 0, sizeof(saddr)); 1168 memset(&saddr, 0, sizeof(saddr));
1078 result = nodeid_to_addr(con->nodeid, &saddr, NULL); 1169 result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
1079 if (result < 0) { 1170 if (result < 0) {
1080 log_print("no address for nodeid %d", con->nodeid); 1171 log_print("no address for nodeid %d", con->nodeid);
1081 goto out_err; 1172 goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
1254 int result = -EINVAL, num = 1, i, addr_len; 1345 int result = -EINVAL, num = 1, i, addr_len;
1255 struct connection *con = nodeid2con(0, GFP_NOFS); 1346 struct connection *con = nodeid2con(0, GFP_NOFS);
1256 int bufsize = NEEDED_RMEM; 1347 int bufsize = NEEDED_RMEM;
1348 int one = 1;
1257 1349
1258 if (!con) 1350 if (!con)
1259 return -ENOMEM; 1351 return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
1288 goto create_delsock; 1380 goto create_delsock;
1289 } 1381 }
1290 1382
1383 result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
1384 sizeof(one));
1385 if (result < 0)
1386 log_print("Could not set SCTP NODELAY error %d\n", result);
1387
1291 /* Init con struct */ 1388 /* Init con struct */
1292 sock->sk->sk_user_data = con; 1389 sock->sk->sk_user_data = con;
1293 con->sock = sock; 1390 con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
1493 } 1590 }
1494 1591
1495 spin_lock(&con->writequeue_lock); 1592 spin_lock(&con->writequeue_lock);
1496 e->offset += ret; 1593 writequeue_entry_complete(e, ret);
1497 e->len -= ret;
1498
1499 if (e->len == 0 && e->users == 0) {
1500 list_del(&e->list);
1501 free_entry(e);
1502 }
1503 } 1594 }
1504 spin_unlock(&con->writequeue_lock); 1595 spin_unlock(&con->writequeue_lock);
1505out: 1596out:
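
The bulk of the lowcomms diff teaches the SCTP transport to fail over between all of a node's configured addresses instead of pinning the first one: nodeid_to_addr() grows a per-node curr_addr_index that is advanced round-robin when the caller sets try_new_addr (after a failed init), addr_to_nodeid() now matches against every address, and sinfo_flags gets SCTP_ADDR_OVER so the chosen address is honoured. The index advance reduces to a few lines; a standalone restatement, with the dlm_node_addrs_spin locking from the patch omitted:

    #include <stdbool.h>
    #include <stddef.h>
    #include <sys/socket.h>

    #define MAX_ADDR_COUNT 16       /* the patch uses DLM_MAX_ADDR_COUNT */

    struct node_addrs {
            int addr_count;
            int curr_addr_index;
            struct sockaddr_storage *addr[MAX_ADDR_COUNT];
    };

    static struct sockaddr_storage *pick_addr(struct node_addrs *na,
                                              bool try_new_addr)
    {
            if (!na->addr_count)
                    return NULL;
            /* Only rotate when asked to; repeated sends to a healthy
             * peer keep using the address that last worked. */
            if (try_new_addr && ++na->curr_addr_index == na->addr_count)
                    na->curr_addr_index = 0;
            return na->addr[na->curr_addr_index];
    }
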
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 911649a47dd5..812149119fa3 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -686,7 +686,6 @@ static int device_close(struct inode *inode, struct file *file)
686 device_remove_lockspace() */ 686 device_remove_lockspace() */
687 687
688 sigprocmask(SIG_SETMASK, &tmpsig, NULL); 688 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
689 recalc_sigpending();
690 689
691 return 0; 690 return 0;
692} 691}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f71ec125290d..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
37#include <asm/unaligned.h> 37#include <asm/unaligned.h>
38#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
39 39
40static int 40#define DECRYPT 0
41ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, 41#define ENCRYPT 1
42 struct page *dst_page, int dst_offset,
43 struct page *src_page, int src_offset, int size,
44 unsigned char *iv);
45static int
46ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
47 struct page *dst_page, int dst_offset,
48 struct page *src_page, int src_offset, int size,
49 unsigned char *iv);
50 42
51/** 43/**
52 * ecryptfs_to_hex 44 * ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
336} 328}
337 329
338/** 330/**
339 * encrypt_scatterlist 331 * crypt_scatterlist
340 * @crypt_stat: Pointer to the crypt_stat struct to initialize. 332 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
341 * @dest_sg: Destination of encrypted data 333 * @dst_sg: Destination of the data after performing the crypto operation
342 * @src_sg: Data to be encrypted 334 * @src_sg: Data to be encrypted or decrypted
343 * @size: Length of data to be encrypted 335 * @size: Length of data
344 * @iv: iv to use during encryption 336 * @iv: IV to use
337 * @op: ENCRYPT or DECRYPT to indicate the desired operation
345 * 338 *
346 * Returns the number of bytes encrypted; negative value on error 339 * Returns the number of bytes encrypted or decrypted; negative value on error
347 */ 340 */
348static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, 341static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
349 struct scatterlist *dest_sg, 342 struct scatterlist *dst_sg,
350 struct scatterlist *src_sg, int size, 343 struct scatterlist *src_sg, int size,
351 unsigned char *iv) 344 unsigned char *iv, int op)
352{ 345{
353 struct ablkcipher_request *req = NULL; 346 struct ablkcipher_request *req = NULL;
354 struct extent_crypt_result ecr; 347 struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
391 crypt_stat->flags |= ECRYPTFS_KEY_SET; 384 crypt_stat->flags |= ECRYPTFS_KEY_SET;
392 } 385 }
393 mutex_unlock(&crypt_stat->cs_tfm_mutex); 386 mutex_unlock(&crypt_stat->cs_tfm_mutex);
394 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); 387 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
395 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv); 388 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
396 rc = crypto_ablkcipher_encrypt(req); 389 crypto_ablkcipher_decrypt(req);
397 if (rc == -EINPROGRESS || rc == -EBUSY) { 390 if (rc == -EINPROGRESS || rc == -EBUSY) {
398 struct extent_crypt_result *ecr = req->base.data; 391 struct extent_crypt_result *ecr = req->base.data;
399 392
@@ -407,41 +400,43 @@ out:
407} 400}
408 401
409/** 402/**
410 * ecryptfs_lower_offset_for_extent 403 * lower_offset_for_page
411 * 404 *
412 * Convert an eCryptfs page index into a lower byte offset 405 * Convert an eCryptfs page index into a lower byte offset
413 */ 406 */
414static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 407static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
415 struct ecryptfs_crypt_stat *crypt_stat) 408 struct page *page)
416{ 409{
417 (*offset) = ecryptfs_lower_header_size(crypt_stat) 410 return ecryptfs_lower_header_size(crypt_stat) +
418 + (crypt_stat->extent_size * extent_num); 411 (page->index << PAGE_CACHE_SHIFT);
419} 412}
420 413
421/** 414/**
422 * ecryptfs_encrypt_extent 415 * crypt_extent
423 * @enc_extent_page: Allocated page into which to encrypt the data in
424 * @page
425 * @crypt_stat: crypt_stat containing cryptographic context for the 416 * @crypt_stat: crypt_stat containing cryptographic context for the
426 * encryption operation 417 * encryption operation
427 * @page: Page containing plaintext data extent to encrypt 418 * @dst_page: The page to write the result into
419 * @src_page: The page to read from
428 * @extent_offset: Page extent offset for use in generating IV 420 * @extent_offset: Page extent offset for use in generating IV
421 * @op: ENCRYPT or DECRYPT to indicate the desired operation
429 * 422 *
430 * Encrypts one extent of data. 423 * Encrypts or decrypts one extent of data.
431 * 424 *
432 * Return zero on success; non-zero otherwise 425 * Return zero on success; non-zero otherwise
433 */ 426 */
434static int ecryptfs_encrypt_extent(struct page *enc_extent_page, 427static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
435 struct ecryptfs_crypt_stat *crypt_stat, 428 struct page *dst_page,
436 struct page *page, 429 struct page *src_page,
437 unsigned long extent_offset) 430 unsigned long extent_offset, int op)
438{ 431{
432 pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
439 loff_t extent_base; 433 loff_t extent_base;
440 char extent_iv[ECRYPTFS_MAX_IV_BYTES]; 434 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
435 struct scatterlist src_sg, dst_sg;
436 size_t extent_size = crypt_stat->extent_size;
441 int rc; 437 int rc;
442 438
443 extent_base = (((loff_t)page->index) 439 extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
444 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
445 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 440 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
446 (extent_base + extent_offset)); 441 (extent_base + extent_offset));
447 if (rc) { 442 if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
450 (unsigned long long)(extent_base + extent_offset), rc); 445 (unsigned long long)(extent_base + extent_offset), rc);
451 goto out; 446 goto out;
452 } 447 }
453 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 448
454 page, (extent_offset 449 sg_init_table(&src_sg, 1);
455 * crypt_stat->extent_size), 450 sg_init_table(&dst_sg, 1);
456 crypt_stat->extent_size, extent_iv); 451
452 sg_set_page(&src_sg, src_page, extent_size,
453 extent_offset * extent_size);
454 sg_set_page(&dst_sg, dst_page, extent_size,
455 extent_offset * extent_size);
456
457 rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
458 extent_iv, op);
457 if (rc < 0) { 459 if (rc < 0) {
458 printk(KERN_ERR "%s: Error attempting to encrypt page with " 460 printk(KERN_ERR "%s: Error attempting to crypt page with "
459 "page->index = [%ld], extent_offset = [%ld]; " 461 "page_index = [%ld], extent_offset = [%ld]; "
460 "rc = [%d]\n", __func__, page->index, extent_offset, 462 "rc = [%d]\n", __func__, page_index, extent_offset, rc);
461 rc);
462 goto out; 463 goto out;
463 } 464 }
464 rc = 0; 465 rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
489 char *enc_extent_virt; 490 char *enc_extent_virt;
490 struct page *enc_extent_page = NULL; 491 struct page *enc_extent_page = NULL;
491 loff_t extent_offset; 492 loff_t extent_offset;
493 loff_t lower_offset;
492 int rc = 0; 494 int rc = 0;
493 495
494 ecryptfs_inode = page->mapping->host; 496 ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
502 "encrypted extent\n"); 504 "encrypted extent\n");
503 goto out; 505 goto out;
504 } 506 }
505 enc_extent_virt = kmap(enc_extent_page); 507
506 for (extent_offset = 0; 508 for (extent_offset = 0;
507 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 509 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
508 extent_offset++) { 510 extent_offset++) {
509 loff_t offset; 511 rc = crypt_extent(crypt_stat, enc_extent_page, page,
510 512 extent_offset, ENCRYPT);
511 rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
512 extent_offset);
513 if (rc) { 513 if (rc) {
514 printk(KERN_ERR "%s: Error encrypting extent; " 514 printk(KERN_ERR "%s: Error encrypting extent; "
515 "rc = [%d]\n", __func__, rc); 515 "rc = [%d]\n", __func__, rc);
516 goto out; 516 goto out;
517 } 517 }
518 ecryptfs_lower_offset_for_extent(
519 &offset, ((((loff_t)page->index)
520 * (PAGE_CACHE_SIZE
521 / crypt_stat->extent_size))
522 + extent_offset), crypt_stat);
523 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
524 offset, crypt_stat->extent_size);
525 if (rc < 0) {
526 ecryptfs_printk(KERN_ERR, "Error attempting "
527 "to write lower page; rc = [%d]"
528 "\n", rc);
529 goto out;
530 }
531 }
532 rc = 0;
533out:
534 if (enc_extent_page) {
535 kunmap(enc_extent_page);
536 __free_page(enc_extent_page);
537 } 518 }
538 return rc;
539}
540 519
541static int ecryptfs_decrypt_extent(struct page *page, 520 lower_offset = lower_offset_for_page(crypt_stat, page);
542 struct ecryptfs_crypt_stat *crypt_stat, 521 enc_extent_virt = kmap(enc_extent_page);
543 struct page *enc_extent_page, 522 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
544 unsigned long extent_offset) 523 PAGE_CACHE_SIZE);
545{ 524 kunmap(enc_extent_page);
546 loff_t extent_base;
547 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
548 int rc;
549
550 extent_base = (((loff_t)page->index)
551 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
552 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
553 (extent_base + extent_offset));
554 if (rc) {
555 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
556 "extent [0x%.16llx]; rc = [%d]\n",
557 (unsigned long long)(extent_base + extent_offset), rc);
558 goto out;
559 }
560 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
561 (extent_offset
562 * crypt_stat->extent_size),
563 enc_extent_page, 0,
564 crypt_stat->extent_size, extent_iv);
565 if (rc < 0) { 525 if (rc < 0) {
566 printk(KERN_ERR "%s: Error attempting to decrypt to page with " 526 ecryptfs_printk(KERN_ERR,
567 "page->index = [%ld], extent_offset = [%ld]; " 527 "Error attempting to write lower page; rc = [%d]\n",
568 "rc = [%d]\n", __func__, page->index, extent_offset, 528 rc);
569 rc);
570 goto out; 529 goto out;
571 } 530 }
572 rc = 0; 531 rc = 0;
573out: 532out:
533 if (enc_extent_page) {
534 __free_page(enc_extent_page);
535 }
574 return rc; 536 return rc;
575} 537}
576 538
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
594{ 556{
595 struct inode *ecryptfs_inode; 557 struct inode *ecryptfs_inode;
596 struct ecryptfs_crypt_stat *crypt_stat; 558 struct ecryptfs_crypt_stat *crypt_stat;
597 char *enc_extent_virt; 559 char *page_virt;
598 struct page *enc_extent_page = NULL;
599 unsigned long extent_offset; 560 unsigned long extent_offset;
561 loff_t lower_offset;
600 int rc = 0; 562 int rc = 0;
601 563
602 ecryptfs_inode = page->mapping->host; 564 ecryptfs_inode = page->mapping->host;
603 crypt_stat = 565 crypt_stat =
604 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 566 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
605 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); 567 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
606 enc_extent_page = alloc_page(GFP_USER); 568
607 if (!enc_extent_page) { 569 lower_offset = lower_offset_for_page(crypt_stat, page);
608 rc = -ENOMEM; 570 page_virt = kmap(page);
609 ecryptfs_printk(KERN_ERR, "Error allocating memory for " 571 rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
610 "encrypted extent\n"); 572 ecryptfs_inode);
573 kunmap(page);
574 if (rc < 0) {
575 ecryptfs_printk(KERN_ERR,
576 "Error attempting to read lower page; rc = [%d]\n",
577 rc);
611 goto out; 578 goto out;
612 } 579 }
613 enc_extent_virt = kmap(enc_extent_page); 580
614 for (extent_offset = 0; 581 for (extent_offset = 0;
615 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 582 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
616 extent_offset++) { 583 extent_offset++) {
617 loff_t offset; 584 rc = crypt_extent(crypt_stat, page, page,
618 585 extent_offset, DECRYPT);
619 ecryptfs_lower_offset_for_extent(
620 &offset, ((page->index * (PAGE_CACHE_SIZE
621 / crypt_stat->extent_size))
622 + extent_offset), crypt_stat);
623 rc = ecryptfs_read_lower(enc_extent_virt, offset,
624 crypt_stat->extent_size,
625 ecryptfs_inode);
626 if (rc < 0) {
627 ecryptfs_printk(KERN_ERR, "Error attempting "
628 "to read lower page; rc = [%d]"
629 "\n", rc);
630 goto out;
631 }
632 rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
633 extent_offset);
634 if (rc) { 586 if (rc) {
 635 printk(KERN_ERR "%s: Error decrypting extent; " 587 printk(KERN_ERR "%s: Error decrypting extent; "
636 "rc = [%d]\n", __func__, rc); 588 "rc = [%d]\n", __func__, rc);
@@ -638,142 +590,9 @@ int ecryptfs_decrypt_page(struct page *page)
638 } 590 }
639 } 591 }
640out: 592out:
641 if (enc_extent_page) {
642 kunmap(enc_extent_page);
643 __free_page(enc_extent_page);
644 }
645 return rc; 593 return rc;
646} 594}
647 595
648/**
649 * decrypt_scatterlist
650 * @crypt_stat: Cryptographic context
651 * @dest_sg: The destination scatterlist to decrypt into
652 * @src_sg: The source scatterlist to decrypt from
653 * @size: The number of bytes to decrypt
654 * @iv: The initialization vector to use for the decryption
655 *
656 * Returns the number of bytes decrypted; negative value on error
657 */
658static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
659 struct scatterlist *dest_sg,
660 struct scatterlist *src_sg, int size,
661 unsigned char *iv)
662{
663 struct ablkcipher_request *req = NULL;
664 struct extent_crypt_result ecr;
665 int rc = 0;
666
667 BUG_ON(!crypt_stat || !crypt_stat->tfm
668 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
669 if (unlikely(ecryptfs_verbosity > 0)) {
670 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
671 crypt_stat->key_size);
672 ecryptfs_dump_hex(crypt_stat->key,
673 crypt_stat->key_size);
674 }
675
676 init_completion(&ecr.completion);
677
678 mutex_lock(&crypt_stat->cs_tfm_mutex);
679 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
680 if (!req) {
681 mutex_unlock(&crypt_stat->cs_tfm_mutex);
682 rc = -ENOMEM;
683 goto out;
684 }
685
686 ablkcipher_request_set_callback(req,
687 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
688 extent_crypt_complete, &ecr);
689 /* Consider doing this once, when the file is opened */
690 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
691 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
692 crypt_stat->key_size);
693 if (rc) {
694 ecryptfs_printk(KERN_ERR,
695 "Error setting key; rc = [%d]\n",
696 rc);
697 mutex_unlock(&crypt_stat->cs_tfm_mutex);
698 rc = -EINVAL;
699 goto out;
700 }
701 crypt_stat->flags |= ECRYPTFS_KEY_SET;
702 }
703 mutex_unlock(&crypt_stat->cs_tfm_mutex);
704 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
705 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
706 rc = crypto_ablkcipher_decrypt(req);
707 if (rc == -EINPROGRESS || rc == -EBUSY) {
708 struct extent_crypt_result *ecr = req->base.data;
709
710 wait_for_completion(&ecr->completion);
711 rc = ecr->rc;
712 INIT_COMPLETION(ecr->completion);
713 }
714out:
715 ablkcipher_request_free(req);
716 return rc;
717
718}
719
720/**
721 * ecryptfs_encrypt_page_offset
722 * @crypt_stat: The cryptographic context
723 * @dst_page: The page to encrypt into
724 * @dst_offset: The offset in the page to encrypt into
725 * @src_page: The page to encrypt from
726 * @src_offset: The offset in the page to encrypt from
727 * @size: The number of bytes to encrypt
728 * @iv: The initialization vector to use for the encryption
729 *
730 * Returns the number of bytes encrypted
731 */
732static int
733ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 sg_init_table(&src_sg, 1);
741 sg_init_table(&dst_sg, 1);
742
743 sg_set_page(&src_sg, src_page, size, src_offset);
744 sg_set_page(&dst_sg, dst_page, size, dst_offset);
745 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
746}
747
748/**
749 * ecryptfs_decrypt_page_offset
750 * @crypt_stat: The cryptographic context
751 * @dst_page: The page to decrypt into
752 * @dst_offset: The offset in the page to decrypt into
753 * @src_page: The page to decrypt from
754 * @src_offset: The offset in the page to decrypt from
755 * @size: The number of bytes to decrypt
756 * @iv: The initialization vector to use for the decryption
757 *
758 * Returns the number of bytes decrypted
759 */
760static int
761ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 struct page *dst_page, int dst_offset,
763 struct page *src_page, int src_offset, int size,
764 unsigned char *iv)
765{
766 struct scatterlist src_sg, dst_sg;
767
768 sg_init_table(&src_sg, 1);
769 sg_set_page(&src_sg, src_page, size, src_offset);
770
771 sg_init_table(&dst_sg, 1);
772 sg_set_page(&dst_sg, dst_page, size, dst_offset);
773
774 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
775}
776
777#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 596#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
778 597
779/** 598/**
@@ -2243,12 +2062,11 @@ out:
2243 */ 2062 */
2244int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, 2063int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2245 size_t *plaintext_name_size, 2064 size_t *plaintext_name_size,
2246 struct dentry *ecryptfs_dir_dentry, 2065 struct super_block *sb,
2247 const char *name, size_t name_size) 2066 const char *name, size_t name_size)
2248{ 2067{
2249 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2068 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2250 &ecryptfs_superblock_to_private( 2069 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
2251 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2252 char *decoded_name; 2070 char *decoded_name;
2253 size_t decoded_name_size; 2071 size_t decoded_name_size;
2254 size_t packet_size; 2072 size_t packet_size;
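
The eCryptfs rewrite removes roughly 180 lines by collapsing encrypt_scatterlist()/decrypt_scatterlist() and the two *_page_offset() wrappers into a single crypt_scatterlist()/crypt_extent() pair parameterized by an ENCRYPT/DECRYPT flag, and by reading and writing the lower file a whole page at a time instead of one extent at a time. The direction-flag pattern is the reusable idea; reduced to its core, with the request allocation, key install, and IV derivation shared by both paths elided:

    #define DECRYPT 0
    #define ENCRYPT 1

    /* One helper, one flag: the setup that used to be duplicated is
     * written once, and only the final call differs. A sketch of the
     * shape, not the full kernel function. */
    static int crypt_scatterlist(struct ablkcipher_request *req,
                                 struct scatterlist *dst_sg,
                                 struct scatterlist *src_sg,
                                 int size, unsigned char *iv, int op)
    {
            ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
            return op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
                                   crypto_ablkcipher_decrypt(req);
    }
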
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f622a733f7ad..df19d34a033b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -575,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
575 struct inode *ecryptfs_inode); 575 struct inode *ecryptfs_inode);
576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
577 size_t *decrypted_name_size, 577 size_t *decrypted_name_size,
578 struct dentry *ecryptfs_dentry, 578 struct super_block *sb,
579 const char *name, size_t name_size); 579 const char *name, size_t name_size);
580int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 580int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
581int ecryptfs_encrypt_and_encode_filename( 581int ecryptfs_encrypt_and_encode_filename(
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index a7abbea2c096..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
49 unsigned long nr_segs, loff_t pos) 49 unsigned long nr_segs, loff_t pos)
50{ 50{
51 ssize_t rc; 51 ssize_t rc;
52 struct path lower; 52 struct path *path;
53 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
54 54
55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,17 +60,16 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
60 if (-EIOCBQUEUED == rc) 60 if (-EIOCBQUEUED == rc)
61 rc = wait_on_sync_kiocb(iocb); 61 rc = wait_on_sync_kiocb(iocb);
62 if (rc >= 0) { 62 if (rc >= 0) {
63 lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); 63 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
64 lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); 64 touch_atime(path);
65 touch_atime(&lower);
66 } 65 }
67 return rc; 66 return rc;
68} 67}
69 68
70struct ecryptfs_getdents_callback { 69struct ecryptfs_getdents_callback {
71 void *dirent; 70 struct dir_context ctx;
72 struct dentry *dentry; 71 struct dir_context *caller;
73 filldir_t filldir; 72 struct super_block *sb;
74 int filldir_called; 73 int filldir_called;
75 int entries_written; 74 int entries_written;
76}; 75};
@@ -88,7 +87,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
88 87
89 buf->filldir_called++; 88 buf->filldir_called++;
90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, 89 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
91 buf->dentry, lower_name, 90 buf->sb, lower_name,
92 lower_namelen); 91 lower_namelen);
93 if (rc) { 92 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt " 93 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -96,9 +95,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
96 rc); 95 rc);
97 goto out; 96 goto out;
98 } 97 }
99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); 98 buf->caller->pos = buf->ctx.pos;
99 rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
100 kfree(name); 100 kfree(name);
101 if (rc >= 0) 101 if (!rc)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
104 return rc; 104 return rc;
@@ -107,27 +107,22 @@ out:
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The eCryptfs directory file 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry handle 110 * @ctx: The actor to feed the entries to
111 * @filldir: The filldir callback function
112 */ 111 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 112static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
114{ 113{
115 int rc; 114 int rc;
116 struct file *lower_file; 115 struct file *lower_file;
117 struct inode *inode; 116 struct inode *inode = file_inode(file);
118 struct ecryptfs_getdents_callback buf; 117 struct ecryptfs_getdents_callback buf = {
119 118 .ctx.actor = ecryptfs_filldir,
119 .caller = ctx,
120 .sb = inode->i_sb,
121 };
120 lower_file = ecryptfs_file_to_lower(file); 122 lower_file = ecryptfs_file_to_lower(file);
121 lower_file->f_pos = file->f_pos; 123 lower_file->f_pos = ctx->pos;
122 inode = file_inode(file); 124 rc = iterate_dir(lower_file, &buf.ctx);
123 memset(&buf, 0, sizeof(buf)); 125 ctx->pos = buf.ctx.pos;
124 buf.dirent = dirent;
125 buf.dentry = file->f_path.dentry;
126 buf.filldir = filldir;
127 buf.filldir_called = 0;
128 buf.entries_written = 0;
129 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
130 file->f_pos = lower_file->f_pos;
131 if (rc < 0) 126 if (rc < 0)
132 goto out; 127 goto out;
133 if (buf.filldir_called && !buf.entries_written) 128 if (buf.filldir_called && !buf.entries_written)
@@ -344,7 +339,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
344#endif 339#endif
345 340
346const struct file_operations ecryptfs_dir_fops = { 341const struct file_operations ecryptfs_dir_fops = {
347 .readdir = ecryptfs_readdir, 342 .iterate = ecryptfs_readdir,
348 .read = generic_read_dir, 343 .read = generic_read_dir,
349 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 344 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
350#ifdef CONFIG_COMPAT 345#ifdef CONFIG_COMPAT
@@ -365,7 +360,7 @@ const struct file_operations ecryptfs_main_fops = {
365 .aio_read = ecryptfs_read_update_atime, 360 .aio_read = ecryptfs_read_update_atime,
366 .write = do_sync_write, 361 .write = do_sync_write,
367 .aio_write = generic_file_aio_write, 362 .aio_write = generic_file_aio_write,
368 .readdir = ecryptfs_readdir, 363 .iterate = ecryptfs_readdir,
369 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 364 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
370#ifdef CONFIG_COMPAT 365#ifdef CONFIG_COMPAT
371 .compat_ioctl = ecryptfs_compat_ioctl, 366 .compat_ioctl = ecryptfs_compat_ioctl,
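
ecryptfs_readdir() moves from ->readdir()/filldir to the new ->iterate()/dir_context API: the callback buffer embeds its own struct dir_context as its first member, hands that to iterate_dir() on the lower file, and its actor forwards each decrypted name to the caller's context via dir_emit(). The embedding trick is the part worth lifting; a stripped-down stacking readdir under the same 3.11-era API, with the name translation left out and get_lower_file() assumed:

    struct wrap_ctx {
            struct dir_context ctx;   /* first member: what iterate_dir() gets */
            struct dir_context *caller;
    };

    static int wrap_actor(void *data, const char *name, int len,
                          loff_t pos, u64 ino, unsigned int d_type)
    {
            struct wrap_ctx *buf = data;  /* &buf->ctx == data, ctx is first */

            buf->caller->pos = buf->ctx.pos;
            /* dir_emit() returns false once the user buffer is full;
             * a nonzero return here makes iterate_dir() stop. */
            return !dir_emit(buf->caller, name, len, ino, d_type);
    }

    static int wrap_readdir(struct file *file, struct dir_context *ctx)
    {
            struct file *lower_file = get_lower_file(file);  /* assumed */
            struct wrap_ctx buf = { .ctx.actor = wrap_actor, .caller = ctx };
            int rc;

            lower_file->f_pos = ctx->pos;
            rc = iterate_dir(lower_file, &buf.ctx);
            ctx->pos = buf.ctx.pos;
            return rc;
    }
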
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5eab400e2590..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
358 358
359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
361 BUG_ON(!lower_dentry->d_count); 361 BUG_ON(!d_count(lower_dentry));
362 362
363 ecryptfs_set_dentry_private(dentry, dentry_info); 363 ecryptfs_set_dentry_private(dentry, dentry_info);
364 ecryptfs_set_dentry_lower(dentry, lower_dentry); 364 ecryptfs_set_dentry_lower(dentry, lower_dentry);
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
679 set_fs(old_fs); 679 set_fs(old_fs);
680 if (rc < 0) 680 if (rc < 0)
681 goto out; 681 goto out;
682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, 682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
683 lower_buf, rc); 683 lower_buf, rc);
684out: 684out:
685 kfree(lower_buf); 685 kfree(lower_buf);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
120 struct file **lower_file) 120 struct file **lower_file)
121{ 121{
122 const struct cred *cred = current_cred(); 122 const struct cred *cred = current_cred();
123 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 123 struct path *path = ecryptfs_dentry_to_lower_path(dentry);
124 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
125 int rc; 124 int rc;
126 125
127 rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, 126 rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
128 cred); 127 cred);
129 if (rc) { 128 if (rc) {
130 printk(KERN_ERR "Error opening lower file " 129 printk(KERN_ERR "Error opening lower file "
131 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 130 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
132 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 131 "rc = [%d]\n", path->dentry, path->mnt, rc);
133 (*lower_file) = NULL; 132 (*lower_file) = NULL;
134 } 133 }
135 return rc; 134 return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
247 goto unlock; 247 goto unlock;
248 } 248 }
249 msg_size = (sizeof(*msg) + msg->data_len); 249 msg_size = (sizeof(*msg) + msg->data_len);
250 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 250 msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
251 if (!msg_ctx->msg) { 251 if (!msg_ctx->msg) {
252 rc = -ENOMEM; 252 rc = -ENOMEM;
253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " 253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
254 "GFP_KERNEL memory\n", __func__, msg_size); 254 "GFP_KERNEL memory\n", __func__, msg_size);
255 goto unlock; 255 goto unlock;
256 } 256 }
257 memcpy(msg_ctx->msg, msg, msg_size);
258 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; 257 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
259 wake_up_process(msg_ctx->task); 258 wake_up_process(msg_ctx->task);
260 rc = 0; 259 rc = 0;
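
The messaging change is a stock cleanup: kmemdup(src, len, gfp) allocates and copies in one call, removing any chance of the kmalloc()/memcpy() pair drifting apart. Side by side, as a sketch:

    #include <linux/slab.h>
    #include <linux/string.h>

    static void *dup_before(const void *src, size_t len)
    {
            void *dst = kmalloc(len, GFP_KERNEL);

            if (dst)
                    memcpy(dst, src, len);
            return dst;
    }

    static void *dup_after(const void *src, size_t len)
    {
            /* Identical semantics: NULL on allocation failure,
             * otherwise a private copy of src. */
            return kmemdup(src, len, GFP_KERNEL);
    }
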
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 7e787fb90293..07ab49745e31 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -155,20 +155,8 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
155 return 0; 155 return 0;
156}; 156};
157 157
158/*
159 * Handle negative dentry.
160 */
161static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
162 unsigned int flags)
163{
164 if (dentry->d_name.len > NAME_MAX)
165 return ERR_PTR(-ENAMETOOLONG);
166 d_add(dentry, NULL);
167 return NULL;
168}
169
170const struct inode_operations efivarfs_dir_inode_operations = { 158const struct inode_operations efivarfs_dir_inode_operations = {
171 .lookup = efivarfs_lookup, 159 .lookup = simple_lookup,
172 .unlink = efivarfs_unlink, 160 .unlink = efivarfs_unlink,
173 .create = efivarfs_create, 161 .create = efivarfs_create,
174}; 162};
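
efivarfs drops its private lookup in favour of simple_lookup() from fs/libfs.c, which provides exactly the negative-dentry behaviour every simple in-memory filesystem wants: names not backed by an inode hash a negative dentry so later lookups resolve to ENOENT from the cache. The generic helper is, in essence, what the deleted code did (a paraphrase, not the verbatim libfs body):

    static struct dentry *simple_lookup_sketch(struct inode *dir,
                                               struct dentry *dentry,
                                               unsigned int flags)
    {
            if (dentry->d_name.len > NAME_MAX)
                    return ERR_PTR(-ENAMETOOLONG);
            d_add(dentry, NULL);    /* hash a negative dentry */
            return NULL;
    }
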
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 141aee31884f..a8766b880c07 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb;
45 * So we need to perform a case-sensitive match on part 1 and a 45 * So we need to perform a case-sensitive match on part 1 and a
46 * case-insensitive match on part 2. 46 * case-insensitive match on part 2.
47 */ 47 */
48static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, 48static int efivarfs_d_compare(const struct dentry *parent,
49 const struct dentry *dentry, const struct inode *inode, 49 const struct dentry *dentry,
50 unsigned int len, const char *str, 50 unsigned int len, const char *str,
51 const struct qstr *name) 51 const struct qstr *name)
52{ 52{
@@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p
63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); 63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
64} 64}
65 65
66static int efivarfs_d_hash(const struct dentry *dentry, 66static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
67 const struct inode *inode, struct qstr *qstr)
68{ 67{
69 unsigned long hash = init_name_hash(); 68 unsigned long hash = init_name_hash();
70 const unsigned char *s = qstr->name; 69 const unsigned char *s = qstr->name;
@@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
108 q.name = name; 107 q.name = name;
109 q.len = strlen(name); 108 q.len = strlen(name);
110 109
111 err = efivarfs_d_hash(NULL, NULL, &q); 110 err = efivarfs_d_hash(NULL, &q);
112 if (err) 111 if (err)
113 return ERR_PTR(err); 112 return ERR_PTR(err);
114 113
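
(The d_compare()/d_hash() signature change here is the VFS-wide drop of the inode arguments seen throughout this merge; efivarfs simply follows suit, as fs/dcache.c above shows for the callers.)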
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9ca747..b72307ccdf7a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
 #include <linux/buffer_head.h>
 #include "efs.h"
 
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations efs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= efs_readdir,
+	.iterate	= efs_readdir,
 };
 
 const struct inode_operations efs_dir_inode_operations = {
 	.lookup = efs_lookup,
 };
 
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
-	struct inode *inode = file_inode(filp);
-	struct buffer_head *bh;
-
-	struct efs_dir *dirblock;
-	struct efs_dentry *dirslot;
-	efs_ino_t inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
 	efs_block_t block;
-	int slot, namelen;
-	char *nameptr;
+	int slot;
 
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
 	/* work out where this entry can be found */
-	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+	block = ctx->pos >> EFS_DIRBSIZE_BITS;
 
 	/* each block contains at most 256 slots */
-	slot = filp->f_pos & 0xff;
+	slot = ctx->pos & 0xff;
 
 	/* look at all blocks */
 	while (block < inode->i_blocks) {
+		struct efs_dir *dirblock;
+		struct buffer_head *bh;
+
 		/* read the dir block */
 		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
 
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 			break;
 		}
 
-		while (slot < dirblock->slots) {
-			if (dirblock->space[slot] == 0) {
-				slot++;
+		for (; slot < dirblock->slots; slot++) {
+			struct efs_dentry *dirslot;
+			efs_ino_t inodenum;
+			const char *nameptr;
+			int namelen;
+
+			if (dirblock->space[slot] == 0)
 				continue;
-			}
 
 			dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
 
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 #ifdef DEBUG
 			printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
 #endif
-			if (namelen > 0) {
-				/* found the next entry */
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-
-				/* copy filename and data in dirslot */
-				filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
-				/* sanity check */
-				if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
-					printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
-					slot++;
-					continue;
-				}
-
-				/* store position of next slot */
-				if (++slot == dirblock->slots) {
-					slot = 0;
-					block++;
-				}
+			if (!namelen)
+				continue;
+			/* found the next entry */
+			ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+			/* sanity check */
+			if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+				printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+				continue;
+			}
+
+			/* copy filename and data in dirslot */
+			if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
 				brelse(bh);
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-				goto out;
+				return 0;
 			}
-			slot++;
 		}
 		brelse(bh);
 
 		slot = 0;
 		block++;
 	}
-
-	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+	ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
 	return 0;
 }
 
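
The efs conversion above is the template repeated for every filesystem in this series: ->readdir(file, dirent, filldir) becomes ->iterate(file, ctx), the position moves from file->f_pos into ctx->pos, and entries are handed to dir_emit(), which returns false once the user buffer is full. A hedged sketch of the minimal shape, for a hypothetical "examplefs" with one hard-coded entry:

/* Sketch only: examplefs and its lone entry (inode 42) are made up. */
#include <linux/fs.h>

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	/* positions 0 and 1 are "." and ".." */
	if (!dir_emit_dots(file, ctx))
		return 0;
	if (ctx->pos == 2) {
		if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
			return 0;	/* buffer full; getdents() resumes here */
		ctx->pos++;
	}
	return 0;
}

static const struct file_operations examplefs_dir_ops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate	= examplefs_iterate,
};

Because ctx->pos is advanced before each emit, a short getdents() buffer simply restarts at the saved position, which is what lets the converted efs code drop its goto-based bookkeeping.
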
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index deecc7294a67..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
@@ -1602,7 +1603,8 @@ fetch_events:
 		}
 
 		spin_unlock_irqrestore(&ep->lock, flags);
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!freezable_schedule_hrtimeout_range(to, slack,
+							HRTIMER_MODE_ABS))
 			timed_out = 1;
 
 		spin_lock_irqsave(&ep->lock, flags);
@@ -1975,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			return -EINVAL;
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1993,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 				sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return error;
@@ -2020,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
 			return -EFAULT;
 		sigset_from_compat(&ksigmask, &csigmask);
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2038,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 				sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return err;
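
Two independent fixes share these eventpoll hunks: the timeout sleep becomes freezable so a pending epoll_wait() no longer blocks suspend, and the sigmask juggling in epoll_pwait() switches from raw sigprocmask() to set_current_blocked(), which already refuses to block SIGKILL/SIGSTOP internally, making the explicit sigdelsetmask() redundant. A hedged sketch of the save/swap/restore idiom as it now reads (kernel context; the example_* names are illustrative):

/* Sketch only: example_* names are made up. */
#include <linux/signal.h>
#include <linux/sched.h>

static void example_install_sigmask(sigset_t *newset, sigset_t *saved)
{
	*saved = current->blocked;	/* stash the caller's mask */
	set_current_blocked(newset);	/* filters SIGKILL/SIGSTOP itself */
}

static void example_restore_sigmask(sigset_t *saved)
{
	set_current_blocked(saved);
}
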
diff --git a/fs/exec.c b/fs/exec.c
index ffd7a813ad3d..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	static const struct open_flags uselib_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
@@ -756,10 +757,11 @@ struct file *open_exec(const char *name)
 	static const struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
 	if (IS_ERR(file))
 		goto out;
 
@@ -930,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
 	 * also take its birthdate (always earlier than our own).
 	 */
 	tsk->start_time = leader->start_time;
+	tsk->real_start_time = leader->real_start_time;
 
 	BUG_ON(!same_thread_group(leader, tsk));
 	BUG_ON(has_group_leader_pid(tsk));
@@ -945,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
 	 * Note: The old leader also uses this pid until release_task
 	 *   is called.  Odd but simple and correct.
 	 */
-	detach_pid(tsk, PIDTYPE_PID);
 	tsk->pid = leader->pid;
-	attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
+	change_pid(tsk, PIDTYPE_PID, task_pid(leader));
 	transfer_pid(leader, tsk, PIDTYPE_PGID);
 	transfer_pid(leader, tsk, PIDTYPE_SID);
 
@@ -1463,7 +1465,6 @@ static int do_execve_common(const char *filename,
 	struct files_struct *displaced;
 	bool clear_in_exec;
 	int retval;
-	const struct cred *cred = current_cred();
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1472,7 +1473,7 @@ static int do_execve_common(const char *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 46375896cfc0..49f51ab4caac 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
 }
 
 static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
+	int need_revalidate = (file->f_version != inode->i_version);
 
 	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
 		return 0;
 
-	types = exofs_filetype_table;
-
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 							      chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode_no) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < EXOFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
+				if (!dir_emit(ctx, de->name, de->name_len,
 						le64_to_cpu(de->inode_no),
-						d_type);
-				if (over) {
+						t)) {
 					exofs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		exofs_put_page(page);
 	}
-
 	return 0;
 }
 
@@ -669,5 +663,5 @@ not_empty:
 const struct file_operations exofs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= exofs_readdir,
+	.iterate	= exofs_readdir,
 };
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc9940982..293bc2e47a73 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 }
 
 struct getdents_callback {
+	struct dir_context ctx;
 	char *name;		/* name that was found. It already points to a
 				   buffer NAME_MAX+1 is size */
 	unsigned long ino;	/* the inum we are looking for */
@@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	struct inode *dir = path->dentry->d_inode;
 	int error;
 	struct file *file;
-	struct getdents_callback buffer;
+	struct getdents_callback buffer = {
+		.ctx.actor = filldir_one,
+		.name = name,
+		.ino = child->d_inode->i_ino
+	};
 
 	error = -ENOTDIR;
 	if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->readdir)
+	if (!file->f_op->iterate)
 		goto out_close;
 
-	buffer.name = name;
-	buffer.ino = child->d_inode->i_ino;
-	buffer.found = 0;
 	buffer.sequence = 0;
 	while (1) {
 		int old_seq = buffer.sequence;
 
-		error = vfs_readdir(file, filldir_one, &buffer);
+		error = iterate_dir(file, &buffer.ctx);
 		if (buffer.found) {
 			error = 0;
 			break;
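
The exportfs change shows the caller side of the iterate API: code that used to pass a filldir callback plus an opaque pointer to vfs_readdir() now embeds a struct dir_context at the start of its private state and hands that to iterate_dir(). The actor receives the context pointer back, so keeping it as the first member lets the callback recover its state with a plain cast. A hedged sketch of the pattern (the actor below is illustrative, not the kernel's filldir_one):

/* Sketch only: example_cb/example_actor are made-up names. */
#include <linux/fs.h>
#include <linux/types.h>

struct example_cb {
	struct dir_context ctx;	/* must be first: iterate_dir() passes &ctx to the actor */
	int count;
};

static int example_actor(void *__buf, const char *name, int len,
			 loff_t pos, u64 ino, unsigned int d_type)
{
	struct example_cb *cb = __buf;	/* valid because ctx is the first member */

	cb->count++;
	return 0;			/* nonzero stops the walk */
}

static int example_count_entries(struct file *dir, int *out)
{
	struct example_cb cb = { .ctx.actor = example_actor };
	int err = iterate_dir(dir, &cb.ctx);

	*out = cb.count;
	return err;
}
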
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bfd27..6e1d4ab09d72 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
 }
 
 static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
 		return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 			ext2_error(sb, __func__,
 				   "bad page in #%lu",
 				   inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (types && de->file_type < EXT2_FT_MAX)
 					d_type = types[de->file_type];
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-					(n<<PAGE_CACHE_SHIFT) | offset,
-					le32_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le32_to_cpu(de->inode),
+						d_type)) {
 					ext2_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext2_rec_len_from_disk(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
@@ -724,7 +721,7 @@ not_empty:
 const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext2_readdir,
+	.iterate	= ext2_readdir,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = ext2_new_inode(dir, mode, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (ext2_use_xip(inode->i_sb)) {
+		inode->i_mapping->a_ops = &ext2_aops_xip;
+		inode->i_fop = &ext2_xip_file_operations;
+	} else if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
 static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
+	.tmpfile	= ext2_tmpfile,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
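
ext2_tmpfile() above is the filesystem backend for the then-new O_TMPFILE open flag: create an inode with no directory entry, which can later be given a name with linkat() or simply vanish on close. A hedged userspace sketch of what this enables (requires a 3.11+ kernel and a libc that defines O_TMPFILE; error handling trimmed):

/* Sketch only: the path and payload are arbitrary. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	write(fd, "scratch", 7);	/* the file exists but has no name */

	/* optionally materialize it; without this it disappears on close */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, path, AT_FDCWD, "now-visible", AT_SYMLINK_FOLLOW);
	close(fd);
	return 0;
}

The ext3 variant later in this series additionally parks the new inode on the orphan list, so an unlinked-but-open tmpfile is reclaimed after a crash; the matching ext3_orphan_del() in the link path undoes that once the file gains a name.
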
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbbca255..f522425aaa24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext3_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext3_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned long offset;
-	int i, stored;
+	int i;
 	struct ext3_dir_entry_2 *de;
 	int err;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext3_dx_readdir(filp, dirent, filldir);
-		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
-		}
+		err = ext3_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR)
+			return err;
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
 	}
-	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+	while (ctx->pos < inode->i_size) {
+		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
 		struct buffer_head map_bh;
 		struct buffer_head *bh = NULL;
 
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
 
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
 			if (!dir_has_error) {
 				ext3_error(sb, __func__, "directory #%lu "
 					"contains a hole at offset %lld",
-					inode->i_ino, filp->f_pos);
+					inode->i_ino, ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (offset && file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext3_dir_entry_2 *)
 					(bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
 				i += ext3_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
 						bh, offset)) {
-				/* On error, skip the f_pos to the
+				/* On error, skip the to the
 				   next block. */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext3_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
+				if (!dir_emit(ctx, de->name, de->name_len,
+					      le32_to_cpu(de->inode),
+					      get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
+		if (ctx->pos < inode->i_size)
+			if (!dir_relax(inode))
+				return 0;
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
-			filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block * sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		printk("call_filldir: called with null fname?!?\n");
-		return 0;
+		return true;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name, fname->name_len,
 			fname->inode,
-			get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+			get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return false;
 		}
 		fname = fname->next;
 	}
-	return 0;
+	return true;
 }
 
-static int ext3_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext3_htree_create_dir_info(filp, filp->f_pos);
+		info = ext3_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext3_get_htree_eof(filp))
+	if (ctx->pos == ext3_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (!call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext3_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext3_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (!call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
 const struct file_operations ext3_dir_operations = {
 	.llseek		= ext3_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext3_readdir,
+	.iterate	= ext3_readdir,
 	.unlocked_ioctl = ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ext3_compat_ioctl,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext3_sync_file_enter(file, datasync);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated state */
+		smp_rmb();
+		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+			return -EROFS;
 		return 0;
-
+	}
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret)
 		goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1984,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
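
Both invalidatepage conversions in this series (exofs earlier, ext3 here) follow the same mm API change: ->invalidatepage() gains a length argument so callers can invalidate a sub-range of a page instead of always "offset to end of page", and a full-page invalidation is now the pair (0, PAGE_CACHE_SIZE). A hedged sketch of the new shape, with a hypothetical examplefs:

/* Sketch only: examplefs is made up; the signature matches the hunks above. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void examplefs_invalidatepage(struct page *page, unsigned int offset,
				     unsigned int length)
{
	/* the old "offset == 0" full-truncate test needs the length check too */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		ClearPageChecked(page);
}
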
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
@@ -1762,6 +1759,45 @@ retry:
 	return err;
 }
 
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT3_XATTR_TRANS_BLOCKS);
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	inode = ext3_new_inode (handle, dir, NULL, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+		d_tmpfile(dentry, inode);
+		err = ext3_orphan_add(handle, inode);
+		if (err)
+			goto err_drop_inode;
+		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:
+	ext3_journal_stop(handle);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
 static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	handle_t *handle;
@@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
 
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2317,6 +2353,11 @@ retry:
 	err = ext3_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext3_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext3_orphan_del(handle, inode);
 		d_instantiate(dentry, inode);
 	} else {
 		drop_nlink(inode);
@@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.mkdir		= ext3_mkdir,
 	.rmdir		= ext3_rmdir,
 	.mknod		= ext3_mknod,
+	.tmpfile	= ext3_tmpfile,
 	.rename		= ext3_rename,
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
 	if (test_opt (sb, ERRORS_RO)) {
 		ext3_msg(sb, KERN_CRIT,
 			"error: remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_state will be visible
+		 * before ->s_flags update.
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
 	ext3_msg(sb, KERN_CRIT,
 		"error: remounting filesystem read-only");
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-	sb->s_flags |= MS_RDONLY;
 	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+	/*
+	 * Make sure updated value of ->s_mount_state will be visible
+	 * before ->s_flags update.
+	 */
+	smp_wmb();
+	sb->s_flags |= MS_RDONLY;
+
 	if (EXT3_SB(sb)->s_journal)
 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
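
The fsync.c and super.c hunks are a matched pair of barriers: the error paths publish EXT3_ERROR_FS in ->s_mount_state before setting MS_RDONLY (smp_wmb), and ext3_sync_file() reads the two in the opposite order (smp_rmb), so any fsync that observes the read-only flag is guaranteed to also observe the error state and return -EROFS. A hedged, self-contained rendering of the idiom:

/* Sketch only: example_* stand in for s_mount_state and s_flags. */
#include <asm/barrier.h>

static int example_state;	/* plays s_mount_state */
static int example_flag;	/* plays s_flags */

static void example_publish_error(void)
{
	example_state = 1;
	smp_wmb();		/* state must be visible before the flag */
	example_flag = 1;
}

static int example_saw_error(void)
{
	if (example_flag) {
		smp_rmb();	/* pairs with the smp_wmb() above */
		return example_state;	/* guaranteed to read 1 here */
	}
	return 0;
}
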
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
 	ext4_group_t group;
 
 	if (test_opt2(sb, STD_GROUP_SIZE))
-		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-			 block) >>
+		group = (block -
+			 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
 			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
 	else
 		ext4_get_group_no_and_offset(sb, block, &group, NULL);
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
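
test_root() decides whether group number a is a power of b (sparse backup superblocks live in groups 1, b, b^2, ... for b in {3, 5, 7}). The old version multiplied num upward, which can overflow int for large group numbers; the rewrite divides downward and cannot. A hedged userspace rendering for experimentation:

/* Sketch only: standalone copy of the rewritten logic. */
#include <stdio.h>

static int is_power_of(unsigned int a, unsigned int b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if (a % b)
			return 0;
		a /= b;	/* strictly decreasing, so the loop terminates */
	}
}

int main(void)
{
	printf("%d %d %d\n", is_power_of(49, 7), is_power_of(343, 7),
	       is_power_of(50, 7));	/* prints: 1 1 0 */
	return 0;
}

(The a == 1 case is handled by the caller, ext4_group_sparse(), which treats groups 0 and 1 specially.)
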
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4254e0..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
29#include "ext4.h" 29#include "ext4.h"
30#include "xattr.h" 30#include "xattr.h"
31 31
32static int ext4_dx_readdir(struct file *filp, 32static int ext4_dx_readdir(struct file *, struct dir_context *);
33 void *dirent, filldir_t filldir);
34 33
35/** 34/**
36 * Check if the given dir-inode refers to an htree-indexed directory 35 * Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
103 return 1; 102 return 1;
104} 103}
105 104
106static int ext4_readdir(struct file *filp, 105static int ext4_readdir(struct file *file, struct dir_context *ctx)
107 void *dirent, filldir_t filldir)
108{ 106{
109 int error = 0;
110 unsigned int offset; 107 unsigned int offset;
111 int i, stored; 108 int i, stored;
112 struct ext4_dir_entry_2 *de; 109 struct ext4_dir_entry_2 *de;
113 int err; 110 int err;
114 struct inode *inode = file_inode(filp); 111 struct inode *inode = file_inode(file);
115 struct super_block *sb = inode->i_sb; 112 struct super_block *sb = inode->i_sb;
116 int ret = 0;
117 int dir_has_error = 0; 113 int dir_has_error = 0;
118 114
119 if (is_dx_dir(inode)) { 115 if (is_dx_dir(inode)) {
120 err = ext4_dx_readdir(filp, dirent, filldir); 116 err = ext4_dx_readdir(file, ctx);
121 if (err != ERR_BAD_DX_DIR) { 117 if (err != ERR_BAD_DX_DIR) {
122 ret = err; 118 return err;
123 goto out;
124 } 119 }
125 /* 120 /*
126 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
127 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
128 */ 123 */
129 ext4_clear_inode_flag(file_inode(filp), 124 ext4_clear_inode_flag(file_inode(file),
130 EXT4_INODE_INDEX); 125 EXT4_INODE_INDEX);
131 } 126 }
132 127
133 if (ext4_has_inline_data(inode)) { 128 if (ext4_has_inline_data(inode)) {
134 int has_inline_data = 1; 129 int has_inline_data = 1;
135 ret = ext4_read_inline_dir(filp, dirent, filldir, 130 int ret = ext4_read_inline_dir(file, ctx,
136 &has_inline_data); 131 &has_inline_data);
137 if (has_inline_data) 132 if (has_inline_data)
138 return ret; 133 return ret;
139 } 134 }
140 135
141 stored = 0; 136 stored = 0;
142 offset = filp->f_pos & (sb->s_blocksize - 1); 137 offset = ctx->pos & (sb->s_blocksize - 1);
143 138
144 while (!error && !stored && filp->f_pos < inode->i_size) { 139 while (ctx->pos < inode->i_size) {
145 struct ext4_map_blocks map; 140 struct ext4_map_blocks map;
146 struct buffer_head *bh = NULL; 141 struct buffer_head *bh = NULL;
147 142
148 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 143 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
149 map.m_len = 1; 144 map.m_len = 1;
150 err = ext4_map_blocks(NULL, inode, &map, 0); 145 err = ext4_map_blocks(NULL, inode, &map, 0);
151 if (err > 0) { 146 if (err > 0) {
152 pgoff_t index = map.m_pblk >> 147 pgoff_t index = map.m_pblk >>
153 (PAGE_CACHE_SHIFT - inode->i_blkbits); 148 (PAGE_CACHE_SHIFT - inode->i_blkbits);
154 if (!ra_has_index(&filp->f_ra, index)) 149 if (!ra_has_index(&file->f_ra, index))
155 page_cache_sync_readahead( 150 page_cache_sync_readahead(
156 sb->s_bdev->bd_inode->i_mapping, 151 sb->s_bdev->bd_inode->i_mapping,
157 &filp->f_ra, filp, 152 &file->f_ra, file,
158 index, 1); 153 index, 1);
159 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 154 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
160 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 155 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
161 } 156 }
162 157
@@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,
166 */ 161 */
167 if (!bh) { 162 if (!bh) {
168 if (!dir_has_error) { 163 if (!dir_has_error) {
169 EXT4_ERROR_FILE(filp, 0, 164 EXT4_ERROR_FILE(file, 0,
170 "directory contains a " 165 "directory contains a "
171 "hole at offset %llu", 166 "hole at offset %llu",
172 (unsigned long long) filp->f_pos); 167 (unsigned long long) ctx->pos);
173 dir_has_error = 1; 168 dir_has_error = 1;
174 } 169 }
175 /* corrupt size? Maybe no more blocks to read */ 170 /* corrupt size? Maybe no more blocks to read */
176 if (filp->f_pos > inode->i_blocks << 9) 171 if (ctx->pos > inode->i_blocks << 9)
177 break; 172 break;
178 filp->f_pos += sb->s_blocksize - offset; 173 ctx->pos += sb->s_blocksize - offset;
179 continue; 174 continue;
180 } 175 }
181 176
@@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,
183 if (!buffer_verified(bh) && 178 if (!buffer_verified(bh) &&
184 !ext4_dirent_csum_verify(inode, 179 !ext4_dirent_csum_verify(inode,
185 (struct ext4_dir_entry *)bh->b_data)) { 180 (struct ext4_dir_entry *)bh->b_data)) {
186 EXT4_ERROR_FILE(filp, 0, "directory fails checksum " 181 EXT4_ERROR_FILE(file, 0, "directory fails checksum "
187 "at offset %llu", 182 "at offset %llu",
188 (unsigned long long)filp->f_pos); 183 (unsigned long long)ctx->pos);
189 filp->f_pos += sb->s_blocksize - offset; 184 ctx->pos += sb->s_blocksize - offset;
190 brelse(bh); 185 brelse(bh);
191 continue; 186 continue;
192 } 187 }
193 set_buffer_verified(bh); 188 set_buffer_verified(bh);
194 189
195revalidate:
196 /* If the dir block has changed since the last call to 190 /* If the dir block has changed since the last call to
197 * readdir(2), then we might be pointing to an invalid 191 * readdir(2), then we might be pointing to an invalid
198 * dirent right now. Scan from the start of the block 192 * dirent right now. Scan from the start of the block
199 * to make sure. */ 193 * to make sure. */
200 if (filp->f_version != inode->i_version) { 194 if (file->f_version != inode->i_version) {
201 for (i = 0; i < sb->s_blocksize && i < offset; ) { 195 for (i = 0; i < sb->s_blocksize && i < offset; ) {
202 de = (struct ext4_dir_entry_2 *) 196 de = (struct ext4_dir_entry_2 *)
203 (bh->b_data + i); 197 (bh->b_data + i);
@@ -214,57 +208,46 @@ revalidate:
214 sb->s_blocksize); 208 sb->s_blocksize);
215 } 209 }
216 offset = i; 210 offset = i;
217 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 211 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
218 | offset; 212 | offset;
219 filp->f_version = inode->i_version; 213 file->f_version = inode->i_version;
220 } 214 }
221 215
222 while (!error && filp->f_pos < inode->i_size 216 while (ctx->pos < inode->i_size
223 && offset < sb->s_blocksize) { 217 && offset < sb->s_blocksize) {
224 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 218 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
225 if (ext4_check_dir_entry(inode, filp, de, bh, 219 if (ext4_check_dir_entry(inode, file, de, bh,
226 bh->b_data, bh->b_size, 220 bh->b_data, bh->b_size,
227 offset)) { 221 offset)) {
228 /* 222 /*
229 * On error, skip the f_pos to the next block 223 * On error, skip to the next block
230 */ 224 */
231 filp->f_pos = (filp->f_pos | 225 ctx->pos = (ctx->pos |
232 (sb->s_blocksize - 1)) + 1; 226 (sb->s_blocksize - 1)) + 1;
233 brelse(bh); 227 break;
234 ret = stored;
235 goto out;
236 } 228 }
237 offset += ext4_rec_len_from_disk(de->rec_len, 229 offset += ext4_rec_len_from_disk(de->rec_len,
238 sb->s_blocksize); 230 sb->s_blocksize);
239 if (le32_to_cpu(de->inode)) { 231 if (le32_to_cpu(de->inode)) {
240 /* We might block in the next section 232 if (!dir_emit(ctx, de->name,
241 * if the data destination is
242 * currently swapped out. So, use a
243 * version stamp to detect whether or
244 * not the directory has been modified
245 * during the copy operation.
246 */
247 u64 version = filp->f_version;
248
249 error = filldir(dirent, de->name,
250 de->name_len, 233 de->name_len,
251 filp->f_pos,
252 le32_to_cpu(de->inode), 234 le32_to_cpu(de->inode),
253 get_dtype(sb, de->file_type)); 235 get_dtype(sb, de->file_type))) {
254 if (error) 236 brelse(bh);
255 break; 237 return 0;
256 if (version != filp->f_version) 238 }
257 goto revalidate;
258 stored++;
259 } 239 }
260 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 240 ctx->pos += ext4_rec_len_from_disk(de->rec_len,
261 sb->s_blocksize); 241 sb->s_blocksize);
262 } 242 }
263 offset = 0; 243 offset = 0;
264 brelse(bh); 244 brelse(bh);
245 if (ctx->pos < inode->i_size) {
246 if (!dir_relax(inode))
247 return 0;
248 }
265 } 249 }
266out: 250 return 0;
267 return ret;
268} 251}
269 252
 static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entries on the fname linked list. (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-			filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block *sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
 			 inode->i_ino, current->comm);
 		return 0;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
-				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
+				fname->inode,
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return 1;
 		}
 		fname = fname->next;
 	}
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int ret;
 
 	if (!info) {
-		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		info = ext4_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext4_get_htree_eof(filp))
+	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
 	 * cached entries.
 	 */
 	if ((!info->curr_node) ||
-	    (filp->f_version != inode->i_version)) {
+	    (file->f_version != inode->i_version)) {
 		info->curr_node = NULL;
 		free_rb_tree_fname(&info->root);
-		filp->f_version = inode->i_version;
-		ret = ext4_htree_fill_tree(filp, info->curr_hash,
+		file->f_version = inode->i_version;
+		ret = ext4_htree_fill_tree(file, info->curr_hash,
 					   info->curr_minor_hash,
 					   &info->next_hash);
 		if (ret < 0)
 			return ret;
 		if (ret == 0) {
-			filp->f_pos = ext4_get_htree_eof(filp);
+			ctx->pos = ext4_get_htree_eof(file);
 			break;
 		}
 		info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext4_readdir,
+	.iterate	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
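
The dir.c hunks above are ext4's part of the VFS-wide conversion from the old ->readdir()/filldir_t callback pair to the new ->iterate()/struct dir_context interface: the directory position moves from file->f_pos into ctx->pos, and dir_emit() replaces the raw filldir() call. A rough, self-contained sketch of the contract an iterate method now follows (the examplefs_* names are hypothetical helpers, not ext4 code):

	/* Hedged sketch of the ->iterate() contract. dir_emit() returns
	 * false once the caller's buffer is full; the filesystem simply
	 * stops, and the VFS resumes later from ctx->pos. */
	static int examplefs_iterate(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		for (; ctx->pos < examplefs_nr_entries(inode); ctx->pos++) {
			struct examplefs_dirent *de;

			de = examplefs_entry(inode, ctx->pos); /* hypothetical */
			if (!dir_emit(ctx, de->name, de->name_len,
				      de->ino, de->file_type))
				return 0; /* buffer full; pos is preserved */
		}
		return 0;
	}

This is also why call_filldir() now returns 1 instead of an error value: under dir_emit(), a full buffer is not an error, just a signal to stop emitting.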
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d12d400..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent conversions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have conversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  const struct iovec *iov, loff_t offset,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)					\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 				      __u32 compat);
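
The CONFIG_PRINTK split above is worth a note: in the =n branch the macros route the format string through no_printk(), which generates no code but preserves printf-style argument checking, and they pass empty strings and 0 in place of __func__/__LINE__ so those literals are not kept in the image. A generic sketch of the same idiom, with my_error()/my_report() as hypothetical stand-ins for the __ext4_* helpers:

	#ifdef CONFIG_PRINTK
	#define my_error(sb, fmt, ...) \
		my_report(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
	#else
	#define my_error(sb, fmt, ...)					\
	do {								\
		no_printk(fmt, ##__VA_ARGS__); /* type-check, emit nothing */ \
		my_report(sb, "", 0, " ");   /* still flag the error */	\
	} while (0)
	#endif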
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2515,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 				      struct inode *parent,
 				      struct inode *inode);
 extern int ext4_read_inline_dir(struct file *filp,
-				void *dirent, filldir_t filldir,
+				struct dir_context *ctx,
 				int *has_inline_data);
 extern int htree_inlinedir_to_tree(struct file *dir_file,
 				   struct inode *dir, ext4_lblk_t block,
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
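
Taken together, the ext4.h hunks replace the explicit free/shutdown API for ext4_io_end_t with reference counting (the new atomic_t count field plus ext4_get_io_end()/ext4_put_io_end()/ext4_put_io_end_defer()), and split the completed-IO lists and conversion workqueues into reserved and unreserved variants. A hedged sketch of the lifetime these declarations imply (submit_one_bio() is a hypothetical placeholder, not the actual ext4 submission path):

	ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); /* count = 1 */

	/* each bio in flight pins the io_end */
	submit_one_bio(bio, ext4_get_io_end(io_end));

	/* bio completion (atomic context): drop a reference via workqueue */
	ext4_put_io_end_defer(io_end);

	/* submitter drops its own reference; the last put either frees the
	 * io_end or queues the unwritten-extent conversion */
	ext4_put_io_end(io_end);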
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
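
The refactor above pulls the read-only and aborted-journal checks into ext4_journal_check_start() so they are shared by the normal start path and the new __ext4_journal_start_reserved(). The latter is the attach side of jbd2's reserved handles: credits were set aside earlier, and jbd2_journal_start_reserved() binds the pre-reserved handle to a running transaction from a context (the conversion workqueue) that must not fail for lack of credits. A hedged sketch of a consumer, mirroring what ext4_convert_unwritten_extents() does in the extents.c diff further down:

	/* bind a pre-reserved handle to the running transaction; on
	 * failure the reservation is freed inside the helper */
	handle = ext4_journal_start_reserved(rsv_handle, EXT4_HT_EXT_CONVERT);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... do the journalled work within the reserved credits ... */
	err = ext4_journal_stop(handle);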
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
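
On the caller side, the reservation is made with the new ext4_journal_start_with_reserve() macro: 'blocks' credits for the immediate work plus 'rsv_blocks' set aside for a later, must-succeed continuation. An illustrative call shape (the handle type and credit counts here are examples, not taken from a specific call site; how the reserved handle is handed off to the io_end is elided):

	handle_t *handle;

	handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
						 needed_blocks, rsv_blocks);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... journalled writeback work under 'handle' ... */
	ext4_journal_stop(handle);

The matching second phase, attaching the reserved credits to a transaction later, is sketched after the ext4_jbd2.c diff above.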
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..72ba4705d4fa 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worst case, each tree level
+ * index/leaf needs to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   have been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:    The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free a used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2813,6 +2835,9 @@ again:
 				err = -EIO;
 				break;
 			}
+			/* Yield here to deal with large extent trees.
+			 * Should be a no-op if we did IO above. */
+			cond_resched();
 			if (WARN_ON(i + 1 > depth)) {
 				err = -EIO;
 				break;
@@ -2844,17 +2869,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4242,8 +4264,8 @@ got_allocated_blocks:
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
-				 ext4_ext_get_actual_len(&newex), fb_flags);
+		ext4_free_blocks(handle, inode, NULL, newblock,
+				 EXT4_C2B(sbi, allocated_clusters), fb_flags);
 		goto out2;
 	}
 
@@ -4363,8 +4385,9 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
-
+	trace_ext4_ext_map_blocks_exit(inode, flags, map,
+				       err ? err : allocated);
+	ext4_es_lru_add(inode);
 	return err ? err : allocated;
 }
 
@@ -4386,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
 	err = ext4_es_remove_extent(inode, last_block,
 				    EXT_MAX_BLOCKS - last_block);
+	if (err == -ENOMEM) {
+		cond_resched();
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry;
+	}
+	if (err) {
+		ext4_std_error(inode->i_sb, err);
+		return;
+	}
 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+	ext4_std_error(inode->i_sb, err);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4446,7 +4480,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4582,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4599,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4635,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4659,7 +4711,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4667,7 +4719,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
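
Several of the extents.c hunks share one theme: partial_cluster changes from an unsigned ext4_fsblk_t to a signed long long so it can carry a third state. A positive value still names a bigalloc cluster that may need freeing once the removal finishes, zero means nothing is pending, and a negative value now records "this cluster is shared with an extent we are keeping, never free it". A standalone toy illustration of the encoding (plain C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long long partial_cluster = 0;	/* 0: nothing pending */
		long long cluster = 42;		/* example cluster number */

		partial_cluster = cluster;	/* > 0: candidate for freeing */
		partial_cluster = -cluster;	/* < 0: still in use, keep it */

		if (partial_cluster > 0)
			printf("may free cluster %lld\n", partial_cluster);
		else if (partial_cluster < 0)
			printf("cluster %lld is still referenced\n",
			       -partial_cluster);
		return 0;
	}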
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..91cb110da1b4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  *  Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -147,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			      ext4_lblk_t end);
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 				       int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+			    struct ext4_inode_info *locked_ei);
 
 int __init ext4_init_es(void)
 {
@@ -291,7 +294,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
439 */ 441 */
440 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 442 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
441 if (in_range(es->es_lblk, ee_block, ee_len)) { 443 if (in_range(es->es_lblk, ee_block, ee_len)) {
442 pr_warn("ES insert assertation failed for " 444 pr_warn("ES insert assertion failed for "
443 "inode: %lu we can find an extent " 445 "inode: %lu we can find an extent "
444 "at block [%d/%d/%llu/%c], but we " 446 "at block [%d/%d/%llu/%c], but we "
445 "want to add an delayed/hole extent " 447 "want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
458 */ 460 */
459 if (es->es_lblk < ee_block || 461 if (es->es_lblk < ee_block ||
460 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 462 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
461 pr_warn("ES insert assertation failed for inode: %lu " 463 pr_warn("ES insert assertion failed for inode: %lu "
462 "ex_status [%d/%d/%llu/%c] != " 464 "ex_status [%d/%d/%llu/%c] != "
463 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 465 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
464 ee_block, ee_len, ee_start, 466 ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
468 } 470 }
469 471
470 if (ee_status ^ es_status) { 472 if (ee_status ^ es_status) {
471 pr_warn("ES insert assertation failed for inode: %lu " 473 pr_warn("ES insert assertion failed for inode: %lu "
472 "ex_status [%d/%d/%llu/%c] != " 474 "ex_status [%d/%d/%llu/%c] != "
473 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 475 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
474 ee_block, ee_len, ee_start, 476 ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 	 * that we don't want to add an written/unwritten extent.
 	 */
 	if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
-		pr_warn("ES insert assertation failed for inode: %lu "
+		pr_warn("ES insert assertion failed for inode: %lu "
 			"can't find an extent at block %d but we want "
 			"to add an written/unwritten extent "
 			"[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		 * We want to add a delayed/hole extent but this
 		 * block has been allocated.
 		 */
-		pr_warn("ES insert assertation failed for inode: %lu "
+		pr_warn("ES insert assertion failed for inode: %lu "
 			"We can find blocks but we want to add a "
 			"delayed/hole extent [%d/%d/%llu/%llx]\n",
 			inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		return;
 	} else if (ext4_es_is_written(es)) {
 		if (retval != es->es_len) {
-			pr_warn("ES insert assertation failed for "
+			pr_warn("ES insert assertion failed for "
 				"inode: %lu retval %d != es_len %d\n",
 				inode->i_ino, retval, es->es_len);
 			return;
 		}
 		if (map.m_pblk != ext4_es_pblock(es)) {
-			pr_warn("ES insert assertation failed for "
+			pr_warn("ES insert assertion failed for "
 				"inode: %lu m_pblk %llu != "
 				"es_pblk %llu\n",
 				inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		}
 	} else if (retval == 0) {
 		if (ext4_es_is_written(es)) {
-			pr_warn("ES insert assertation failed for inode: %lu "
+			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can't find the block but we want to add "
 				"an written extent [%d/%d/%llu/%llx]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
@@ -632,10 +634,8 @@ out:
 }
 
 /*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
  *
  * Return 0 on success, error code on failure.
  */
@@ -667,12 +667,17 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	err = __es_remove_extent(inode, lblk, end);
 	if (err != 0)
 		goto error;
+retry:
 	err = __es_insert_extent(inode, &newes);
+	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+					       EXT4_I(inode)))
+		goto retry;
+	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+		err = 0;
 
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -734,7 +739,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -748,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	struct extent_status orig_es;
 	ext4_lblk_t len1, len2;
 	ext4_fsblk_t block;
-	int err = 0;
+	int err;
 
+retry:
+	err = 0;
 	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
@@ -784,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 		if (err) {
 			es->es_lblk = orig_es.es_lblk;
 			es->es_len = orig_es.es_len;
+			if ((err == -ENOMEM) &&
+			    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+					     EXT4_I(inode)))
+				goto retry;
 			goto out;
 		}
 	} else {
@@ -878,38 +888,64 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 			      EXTENT_STATUS_WRITTEN);
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+			    struct ext4_inode_info *locked_ei)
 {
-	struct ext4_sb_info *sbi = container_of(shrink,
-					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
-	int nr_to_scan = sc->nr_to_scan;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
 	int ret, nr_shrunk = 0;
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
-	if (!nr_to_scan)
-		return ret;
+	spin_lock(&sbi->s_es_lru_lock);
 
-	INIT_LIST_HEAD(&scanned);
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
 
-	spin_lock(&sbi->s_es_lru_lock);
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -917,29 +953,50 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
+	if (locked_ei && nr_shrunk == 0)
+		nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+
+	return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct ext4_sb_info *sbi = container_of(shrink,
+					struct ext4_sb_info, s_es_shrinker);
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, nr_shrunk;
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+	if (!nr_to_scan)
+		return ret;
+
+	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +1004,14 @@ void ext4_es_lru_add(struct inode *inode)
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 
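The reworked shrinker above keeps the per-superblock LRU ordered by i_touch_when and only re-sorts when the list head is newer than s_es_last_sorted. A user-space analogue of the comparator contract (assumptions: qsort() stands in for the kernel's list_sort(), and a plain comparison for the jiffies-wraparound-safe time_after()):

#include <stdio.h>
#include <stdlib.h>

struct fake_inode {
	unsigned long i_touch_when;	/* time of last extent-status touch */
};

/* sort least-recently-touched first, so reclaim scans cold inodes early */
static int touch_time_cmp(const void *a, const void *b)
{
	const struct fake_inode *eia = a, *eib = b;

	if (eia->i_touch_when == eib->i_touch_when)
		return 0;
	return eia->i_touch_when > eib->i_touch_when ? 1 : -1;
}

int main(void)
{
	struct fake_inode lru[] = { {300}, {100}, {200} };
	size_t i;

	qsort(lru, 3, sizeof(lru[0]), touch_time_cmp);
	for (i = 0; i < 3; i++)
		printf("%lu\n", lru[i].i_touch_when);	/* 100 200 300 */
	return 0;
}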
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	if (dataoff > isize)
 		return -ENXIO;
 
-	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (dataoff > maxsize)
-		return -EINVAL;
-
-	if (dataoff != file->f_pos) {
-		file->f_pos = dataoff;
-		file->f_version = 0;
-	}
-
-	return dataoff;
+	return vfs_setpos(file, dataoff, maxsize);
 }
 
 /*
@@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -551,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 							  &map, &holeoff);
 			if (!unwritten) {
 				last += ret;
-				holeoff = last << blkbits;
+				holeoff = (loff_t)last << blkbits;
 				continue;
 			}
 		}
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	if (holeoff > isize)
 		holeoff = isize;
 
-	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (holeoff > maxsize)
-		return -EINVAL;
-
-	if (holeoff != file->f_pos) {
-		file->f_pos = holeoff;
-		file->f_version = 0;
-	}
-
-	return holeoff;
+	return vfs_setpos(file, holeoff, maxsize);
 }
 
 /*
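Both seek helpers above now delegate the f_pos update to vfs_setpos(). A hedged, self-contained sketch of the bounds-check-and-update semantics the deleted open-coded blocks implemented (simplified: the real VFS helper also honors FMODE_UNSIGNED_OFFSET, and struct file here is a minimal stand-in):

#include <stdio.h>

typedef long long loff_t;

struct file {				/* minimal stand-in, not the kernel struct */
	loff_t f_pos;
	unsigned long long f_version;
};

static loff_t setpos_sketch(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 || offset > maxsize)
		return -1;		/* -EINVAL in the kernel */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;	/* force readdir-style revalidation */
	}
	return offset;
}

int main(void)
{
	struct file f = { .f_pos = 0, .f_version = 42 };

	printf("pos = %lld\n", setpos_sketch(&f, 4096, 1 << 20));	/* 4096 */
	printf("version = %llu\n", f.f_version);			/* 0 */
	return 0;
}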
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
-		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
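A hedged control-flow sketch (user-space stubs, not kernel code) of the reordered ext4_sync_file() fast paths above: a read-only filesystem whose journal has aborted now fails immediately with -EROFS, and the no-journal case reuses generic_file_fsync() instead of the private __sync_inode() copy that was deleted:

#include <stdio.h>

enum { EROFS = 30 };

static int fs_readonly = 1, fs_aborted = 1, has_journal = 0;

/* assumption: stands in for the VFS helper of the same name */
static int generic_file_fsync_stub(void) { return 0; }

static int sync_file_sketch(void)
{
	if (fs_readonly)
		return fs_aborted ? -EROFS : 0;	/* nothing left to flush */
	if (!has_journal)
		return generic_file_fsync_stub();
	/* ... journalled commit paths elided ... */
	return 0;
}

int main(void)
{
	printf("ret = %d\n", sync_file_sketch());	/* -30, i.e. -EROFS */
	return 0;
}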
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..8bf5999875ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -734,11 +734,8 @@ repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
-			if (++group == ngroups)
-				group = 0;
-			continue;
-		}
+		if (ino >= EXT4_INODES_PER_GROUP(sb))
+			goto next_group;
 		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
 			ext4_error(sb, "reserved inode found cleared - "
 				   "inode=%lu", ino + 1);
@@ -747,7 +744,8 @@ repeat_in_this_group:
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-							 handle_type, nblocks);
+							 handle_type, nblocks,
+							 0);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
 				ext4_std_error(sb, err);
@@ -768,6 +766,9 @@ repeat_in_this_group:
 			goto got; /* we grabbed the inode! */
 		if (ino < EXT4_INODES_PER_GROUP(sb))
 			goto repeat_in_this_group;
+next_group:
+		if (++group == ngroups)
+			group = 0;
 	}
 	err = -ENOSPC;
 	goto out;
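A minimal sketch (user-space, hypothetical group sizes) of the control flow introduced above: exhausting one group's inode bitmap now jumps to a shared next_group label that wraps the group counter, instead of duplicating the wrap logic inside the scan:

#include <stdio.h>

#define NGROUPS		 4
#define INODES_PER_GROUP 8

/* pretend groups 0..2 are full and group 3 has slot 5 free */
static int find_next_zero_bit_stub(int group, int from)
{
	return (group == 3 && from <= 5) ? 5 : INODES_PER_GROUP;
}

int main(void)
{
	int group = 1, scanned, ino;

	for (scanned = 0; scanned < NGROUPS; scanned++) {
		ino = find_next_zero_bit_stub(group, 0);
		if (ino >= INODES_PER_GROUP)
			goto next_group;
		printf("allocated inode %d in group %d\n", ino, group);
		return 0;
next_group:
		if (++group == NGROUPS)
			group = 0;
	}
	printf("no space\n");
	return 1;
}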
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
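A worked example of the surviving ext4_ind_trans_blocks() formula above: with 4 KiB blocks an indirect block holds 1024 block pointers, so mapping N contiguous blocks touches at most N/1024 + 1 indirect blocks, 2 double-indirect blocks and 1 triple-indirect block (the "+ 4"):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int addr_per_block = 4096 / 4;	/* 1024 pointers per 4 KiB block */
	int nrblocks = 3000;		/* contiguous data blocks to map */

	/* DIV_ROUND_UP(3000, 1024) = 3, plus 4 for d/t-indirect reserve */
	printf("metadata blocks = %d\n",
	       DIV_ROUND_UP(nrblocks, addr_per_block) + 4);	/* 7 */
	return 0;
}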
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf873e8a8..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
@@ -1404,16 +1404,15 @@ out:
  * offset as if '.' and '..' really take place.
  *
  */
-int ext4_read_inline_dir(struct file *filp,
-			 void *dirent, filldir_t filldir,
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
 			 int *has_inline_data)
 {
-	int error = 0;
 	unsigned int offset, parent_ino;
-	int i, stored;
+	int i;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,
 		goto out;
 
 	sb = inode->i_sb;
-	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
-	offset = filp->f_pos;
+	offset = ctx->pos;
 
 	/*
 	 * dotdot_offset and dotdot_size is the real offset and
@@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,
 	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
 	extra_size = extra_offset + inline_size;
 
-	while (!error && !stored && filp->f_pos < extra_size) {
-revalidate:
-		/*
-		 * If the version has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now. Scan from the start of the inline
-		 * dir to make sure.
-		 */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < extra_size && i < offset;) {
-				/*
-				 * "." is with offset 0 and
-				 * ".." is dotdot_offset.
-				 */
-				if (!i) {
-					i = dotdot_offset;
-					continue;
-				} else if (i == dotdot_offset) {
-					i = dotdot_size;
-					continue;
-				}
-				/* for other entry, the real offset in
-				 * the buf has to be tuned accordingly.
-				 */
-				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i - extra_offset);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero. A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len,
-					extra_size) < EXT4_DIR_REC_LEN(1))
-					break;
-				i += ext4_rec_len_from_disk(de->rec_len,
-							    extra_size);
-			}
-			offset = i;
-			filp->f_pos = offset;
-			filp->f_version = inode->i_version;
-		}
-
-		while (!error && filp->f_pos < extra_size) {
-			if (filp->f_pos == 0) {
-				error = filldir(dirent, ".", 1, 0, inode->i_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-				filp->f_pos = dotdot_offset;
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now. Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
 				continue;
 			}
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero. A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+				< EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
+		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
 
-		if (filp->f_pos == dotdot_offset) {
-			error = filldir(dirent, "..", 2,
-					dotdot_offset,
-					parent_ino, DT_DIR);
-			if (error)
-				break;
-			stored++;
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
 
-			filp->f_pos = dotdot_size;
-			continue;
-		}
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
 
-			de = (struct ext4_dir_entry_2 *)
-				(dir_buf + filp->f_pos - extra_offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 iloc.bh, dir_buf,
-						 extra_size, filp->f_pos)) {
-				ret = stored;
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
 				goto out;
-			}
-			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out. So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
-			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      extra_size);
 		}
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
 	}
 out:
 	kfree(dir_buf);
@@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
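The readdir conversion above replaces filldir callbacks plus error/stored bookkeeping with dir_emit(), which returns false when the user buffer is full. A hedged user-space sketch of the resulting loop shape (dir_emit_stub and the fixed name table are illustrative assumptions, not kernel API):

#include <stdio.h>
#include <stdbool.h>

struct dir_context { long pos; };

static bool dir_emit_stub(struct dir_context *ctx, const char *name)
{
	printf("pos %ld: %s\n", ctx->pos, name);
	return true;			/* false would mean "buffer full, stop" */
}

int main(void)
{
	struct dir_context ctx = { .pos = 0 };
	const char *names[] = { ".", "..", "foo" };

	while (ctx.pos < 3) {
		if (!dir_emit_stub(&ctx, names[ctx.pos]))
			return 0;	/* a later call resumes from ctx.pos */
		ctx.pos++;
	}
	return 0;
}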
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..dd32a2eacd0d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 				       struct inode *inode,
@@ -524,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 	if (es_map->m_lblk != map->m_lblk ||
 	    es_map->m_flags != map->m_flags ||
 	    es_map->m_pblk != map->m_pblk) {
-		printk("ES cache assertation failed for inode: %lu "
+		printk("ES cache assertion failed for inode: %lu "
 		       "es_cached ex [%d/%d/%llu/%x] != "
 		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
 		       inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -575,6 +516,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
575 516
576 /* Lookup extent status tree firstly */ 517 /* Lookup extent status tree firstly */
577 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 518 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
519 ext4_es_lru_add(inode);
578 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 520 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
579 map->m_pblk = ext4_es_pblock(&es) + 521 map->m_pblk = ext4_es_pblock(&es) +
580 map->m_lblk - es.es_lblk; 522 map->m_lblk - es.es_lblk;
@@ -613,14 +555,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (lookup)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -714,14 +655,13 @@ found:
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (allocation)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		/*
 		 * If the extent has been zeroed out, we don't need to update
@@ -1118,10 +1058,13 @@ static int ext4_write_end(struct file *file,
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
@@ -1157,8 +1100,6 @@ static int ext4_write_end(struct file *file,
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
@@ -1415,21 +1356,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
@@ -1460,140 +1408,43 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1606,14 +1457,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1494,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path. This
-	 * affects functions in many different parts of the allocation
-	 * call path. This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	/*
-	 * We're in delalloc path and it is possible that we're going to
-	 * need more metadata blocks than previously reserved. However
-	 * we must not fail because we're in writeback and there is
-	 * nothing we can do about it so it might result in data loss.
-	 * So use reserved blocks to allocate metadata if possible.
-	 */
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				 "This should not happen!! Data will be lost");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		  (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int blkbits = mpd->inode->i_blkbits;
-	int nrblocks = mpd->b_size >> blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution. We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= (8*1024*1024 >> blkbits))
-		goto flush_it;
-
-	/* check if the reserved journal credits might overflow */
-	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available. Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks. So limit nrblocks.
-			 */
-			goto flush_it;
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = 1 << blkbits;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += 1 << blkbits;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
-	return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1885,7 +1528,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, &es)) {
-
+		ext4_es_lru_add(inode);
 		if (ext4_es_is_hole(&es)) {
 			retval = 0;
 			down_read((&EXT4_I(inode)->i_data_sem));
@@ -1992,14 +1635,13 @@ add_delayed:
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (lookup)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -2156,7 +1798,7 @@ out:
2156 * lock so we have to do some magic. 1798 * lock so we have to do some magic.
2157 * 1799 *
2158 * This function can get called via... 1800 * This function can get called via...
2159 * - ext4_da_writepages after taking page lock (have journal handle) 1801 * - ext4_writepages after taking page lock (have journal handle)
2160 * - journal_submit_inode_data_buffers (no journal handle) 1802 * - journal_submit_inode_data_buffers (no journal handle)
2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1803 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2162 * - grab_page_cache when doing write_begin (have journal handle) 1804 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1876,405 @@ static int ext4_writepage(struct page *page,
2234 */ 1876 */
2235 return __ext4_journalled_writepage(page, len); 1877 return __ext4_journalled_writepage(page, len);
2236 1878
2237 memset(&io_submit, 0, sizeof(io_submit)); 1879 ext4_io_submit_init(&io_submit, wbc);
1880 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1881 if (!io_submit.io_end) {
1882 redirty_page_for_writepage(wbc, page);
1883 unlock_page(page);
1884 return -ENOMEM;
1885 }
2238 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1886 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2239 ext4_io_submit(&io_submit); 1887 ext4_io_submit(&io_submit);
1888 /* Drop io_end reference we got from init */
1889 ext4_put_io_end_defer(io_submit.io_end);
2240 return ret; 1890 return ret;
2241} 1891}
2242 1892
1893#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1894
2243/* 1895/*
2244 * This is called via ext4_da_writepages() to 1896 * mballoc gives us at most this number of blocks...
2245 * calculate the total number of credits to reserve to fit 1897 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2246 * a single extent allocation into a single transaction, 1898 * The rest of mballoc seems to handle chunks up to full group size.
2247 * ext4_da_writepages() will loop calling this before
2248 * the block allocation.
2249 */ 1899 */
1900#define MAX_WRITEPAGES_EXTENT_LEN 2048
2250 1901
2251static int ext4_da_writepages_trans_blocks(struct inode *inode) 1902/*
1903 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1904 *
1905 * @mpd - extent of blocks
1906 * @lblk - logical number of the block in the file
1907 * @b_state - b_state of the buffer head added
1908 *
1909 * the function is used to collect contiguous blocks in the same state
1910 */
1911static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1912 unsigned long b_state)
2252{ 1913{
2253 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1914 struct ext4_map_blocks *map = &mpd->map;
1915
1916 /* Don't go larger than mballoc is willing to allocate */
1917 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1918 return 0;
1919
1920 /* First block in the extent? */
1921 if (map->m_len == 0) {
1922 map->m_lblk = lblk;
1923 map->m_len = 1;
1924 map->m_flags = b_state & BH_FLAGS;
1925 return 1;
1926 }
1927
1928 /* Can we merge the block to our big extent? */
1929 if (lblk == map->m_lblk + map->m_len &&
1930 (b_state & BH_FLAGS) == map->m_flags) {
1931 map->m_len++;
1932 return 1;
1933 }
1934 return 0;
1935}
2254 1936
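The merge rule above is worth pinning down: a buffer joins the pending extent only when it is logically adjacent and carries the same state bits; otherwise the caller must flush what it has. A minimal userspace sketch of that rule, with hypothetical names standing in for struct ext4_map_blocks:

    /* Hypothetical stand-in for the mapping state kept in mpd->map. */
    struct pending_extent {
        unsigned long lblk;   /* logical block of the first member */
        unsigned int  len;    /* number of blocks; 0 means empty */
        unsigned long state;  /* shared buffer-state bits (delay/unwritten) */
    };

    /* Returns 1 if the block was absorbed, 0 if the caller must map/flush. */
    static int try_add_block(struct pending_extent *pe,
                             unsigned long lblk, unsigned long state)
    {
        if (pe->len == 0) {               /* first block starts the extent */
            pe->lblk = lblk;
            pe->len = 1;
            pe->state = state;
            return 1;
        }
        if (lblk == pe->lblk + pe->len && state == pe->state) {
            pe->len++;                    /* adjacent, same state: merge */
            return 1;
        }
        return 0;                         /* gap or state change: flush */
    }

The kernel version additionally refuses to grow past MAX_WRITEPAGES_EXTENT_LEN, so the extent stays within what mballoc will allocate in one request.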
1937static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1938 struct buffer_head *head,
1939 struct buffer_head *bh,
1940 ext4_lblk_t lblk)
1941{
1942 struct inode *inode = mpd->inode;
1943 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1944 >> inode->i_blkbits;
1945
1946 do {
1947 BUG_ON(buffer_locked(bh));
1948
1949 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1950 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1951 lblk >= blocks) {
1952 /* Found extent to map? */
1953 if (mpd->map.m_len)
1954 return false;
1955 if (lblk >= blocks)
1956 return true;
1957 continue;
1958 }
1959 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1960 return false;
1961 } while (lblk++, (bh = bh->b_this_page) != head);
1962 return true;
1963}
1964
1965static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1966{
1967 int len;
1968 loff_t size = i_size_read(mpd->inode);
1969 int err;
1970
1971 BUG_ON(page->index != mpd->first_page);
1972 if (page->index == size >> PAGE_CACHE_SHIFT)
1973 len = size & ~PAGE_CACHE_MASK;
1974 else
1975 len = PAGE_CACHE_SIZE;
1976 clear_page_dirty_for_io(page);
1977 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1978 if (!err)
1979 mpd->wbc->nr_to_write--;
1980 mpd->first_page++;
1981
1982 return err;
1983}
1984
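The len computation in mpage_submit_page() only differs from a full page for the page straddling i_size. A small sketch of that arithmetic, assuming 4 KiB pages (the macro values here are illustrative, not the kernel's):

    #include <stdio.h>

    #define PG_SHIFT 12
    #define PG_SIZE  (1UL << PG_SHIFT)

    /* Bytes of page 'index' that lie below 'size' and need writing. */
    static unsigned long writeback_len(unsigned long index,
                                       unsigned long long size)
    {
        if (index == size >> PG_SHIFT)
            return size & (PG_SIZE - 1);  /* last, partially used page */
        return PG_SIZE;                   /* interior page: write it all */
    }

    int main(void)
    {
        /* A 10000-byte file: pages 0-1 are full, page 2 holds 1808 bytes. */
        printf("%lu %lu %lu\n", writeback_len(0, 10000),
               writeback_len(1, 10000), writeback_len(2, 10000));
        return 0;
    }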
1985/*
1986 * mpage_map_buffers - update buffers corresponding to changed extent and
1987 * submit fully mapped pages for IO
1988 *
1989 * @mpd - description of extent to map, on return next extent to map
1990 *
1991 * Scan buffers corresponding to changed extent (we expect corresponding pages
1992 * to be already locked) and update buffer state according to new extent state.
1993 * We map delalloc buffers to their physical location, clear unwritten bits,
1994 * and mark buffers as uninit when we perform writes to uninitialized extents
1995 * and do extent conversion after IO is finished. If the last page is not fully
1996 * mapped, we update @map to the next extent in the last page that needs
1997 * mapping. Otherwise we submit the page for IO.
1998 */
1999static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2000{
2001 struct pagevec pvec;
2002 int nr_pages, i;
2003 struct inode *inode = mpd->inode;
2004 struct buffer_head *head, *bh;
2005 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2006 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2007 >> inode->i_blkbits;
2008 pgoff_t start, end;
2009 ext4_lblk_t lblk;
2010 sector_t pblock;
2011 int err;
2012
2013 start = mpd->map.m_lblk >> bpp_bits;
2014 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2015 lblk = start << bpp_bits;
2016 pblock = mpd->map.m_pblk;
2017
2018 pagevec_init(&pvec, 0);
2019 while (start <= end) {
2020 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2021 PAGEVEC_SIZE);
2022 if (nr_pages == 0)
2023 break;
2024 for (i = 0; i < nr_pages; i++) {
2025 struct page *page = pvec.pages[i];
2026
2027 if (page->index > end)
2028 break;
2029 /* Up to 'end' pages must be contiguous */
2030 BUG_ON(page->index != start);
2031 bh = head = page_buffers(page);
2032 do {
2033 if (lblk < mpd->map.m_lblk)
2034 continue;
2035 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2036 /*
2037 * Buffer after end of mapped extent.
2038 * Find next buffer in the page to map.
2039 */
2040 mpd->map.m_len = 0;
2041 mpd->map.m_flags = 0;
2042 add_page_bufs_to_extent(mpd, head, bh,
2043 lblk);
2044 pagevec_release(&pvec);
2045 return 0;
2046 }
2047 if (buffer_delay(bh)) {
2048 clear_buffer_delay(bh);
2049 bh->b_blocknr = pblock++;
2050 }
2051 clear_buffer_unwritten(bh);
2052 } while (++lblk < blocks &&
2053 (bh = bh->b_this_page) != head);
2054
2055 /*
2056 * FIXME: This is going to break if dioread_nolock
2057 * supports blocksize < pagesize as we will try to
2058 * convert potentially unmapped parts of inode.
2059 */
2060 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2061 /* Page fully mapped - let IO run! */
2062 err = mpage_submit_page(mpd, page);
2063 if (err < 0) {
2064 pagevec_release(&pvec);
2065 return err;
2066 }
2067 start++;
2068 }
2069 pagevec_release(&pvec);
2070 }
2071 /* Extent fully mapped and matches with page boundary. We are done. */
2072 mpd->map.m_len = 0;
2073 mpd->map.m_flags = 0;
2074 return 0;
2075}
2076
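The start/end page indices above come from plain shifts between block and page units. A sketch of the conversion, assuming blocksize <= pagesize with power-of-two sizes (names illustrative; bpp_bits is log2 of blocks per page):

    /* Map a block extent [m_lblk, m_lblk + m_len) to the pages covering it. */
    static void extent_to_pages(unsigned long m_lblk, unsigned int m_len,
                                int bpp_bits,
                                unsigned long *first_pg, unsigned long *last_pg)
    {
        *first_pg = m_lblk >> bpp_bits;               /* page of first block */
        *last_pg  = (m_lblk + m_len - 1) >> bpp_bits; /* page of last block */
    }

For 4 KiB pages over 1 KiB blocks (bpp_bits = 2), an extent of blocks 5..9 spans pages 1..2, which is exactly the range the pagevec loop walks.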
2077static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2078{
2079 struct inode *inode = mpd->inode;
2080 struct ext4_map_blocks *map = &mpd->map;
2081 int get_blocks_flags;
2082 int err;
2083
2084 trace_ext4_da_write_pages_extent(inode, map);
2255 /* 2085 /*
2256 * With non-extent format the journal credit needed to 2086 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2257 * insert nrblocks contiguous blocks is dependent on 2087 * to convert an uninitialized extent to be initialized (in the case
2258 * the number of contiguous blocks. So we will limit 2088 * where we have written into one or more preallocated blocks). It is
2259 * the number of contiguous blocks to a sane value 2089 * possible that we're going to need more metadata blocks than
2090 * previously reserved. However we must not fail because we're in
2091 * writeback and there is nothing we can do about it so it might result
2092 * in data loss. So use reserved blocks to allocate metadata if
2093 * possible.
2094 *
2095 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2096 * in question are delalloc blocks. This affects functions in many
2097 * different parts of the allocation call path. This flag exists
2098 * primarily because we don't want to change *many* call functions, so
2099 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2100 * once the inode's allocation semaphore is taken.
2260 */ 2101 */
2261 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2102 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2262 (max_blocks > EXT4_MAX_TRANS_DATA)) 2103 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2263 max_blocks = EXT4_MAX_TRANS_DATA; 2104 if (ext4_should_dioread_nolock(inode))
2105 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2106 if (map->m_flags & (1 << BH_Delay))
2107 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2264 2108
2265 return ext4_chunk_trans_blocks(inode, max_blocks); 2109 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2110 if (err < 0)
2111 return err;
2112 if (map->m_flags & EXT4_MAP_UNINIT) {
2113 if (!mpd->io_submit.io_end->handle &&
2114 ext4_handle_valid(handle)) {
2115 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2116 handle->h_rsv_handle = NULL;
2117 }
2118 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2119 }
2120
2121 BUG_ON(map->m_len == 0);
2122 if (map->m_flags & EXT4_MAP_NEW) {
2123 struct block_device *bdev = inode->i_sb->s_bdev;
2124 int i;
2125
2126 for (i = 0; i < map->m_len; i++)
2127 unmap_underlying_metadata(bdev, map->m_pblk + i);
2128 }
2129 return 0;
2266} 2130}
2267 2131
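The flag word handed to ext4_map_blocks() above is built by OR-ing independent policy bits. A hedged sketch of the same composition with made-up values (the real EXT4_GET_BLOCKS_* constants live in fs/ext4/ext4.h and may differ):

    /* Illustrative flag values only, not the kernel's. */
    #define GB_CREATE            0x01  /* allocate, do not just look up */
    #define GB_METADATA_NOFAIL   0x02  /* may use reserved metadata blocks */
    #define GB_IO_CREATE_EXT     0x04  /* unwritten extents for dioread_nolock */
    #define GB_DELALLOC_RESERVE  0x08  /* blocks were reserved at write time */

    static int build_get_blocks_flags(int dioread_nolock, int delalloc)
    {
        /* Writeback must not fail, so these two bits are unconditional. */
        int flags = GB_CREATE | GB_METADATA_NOFAIL;

        if (dioread_nolock)
            flags |= GB_IO_CREATE_EXT;
        if (delalloc)
            flags |= GB_DELALLOC_RESERVE;
        return flags;
    }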
2268/* 2132/*
2269 * write_cache_pages_da - walk the list of dirty pages of the given 2133 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2270 * address space and accumulate pages that need writing, and call 2134 * mpd->len and submit pages underlying it for IO
2271 * mpage_da_map_and_submit to map a single contiguous memory region 2135 *
2272 * and then write them. 2136 * @handle - handle for journal operations
2137 * @mpd - extent to map
2138 *
2139 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2140 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2141 * them to initialized or split the described range from larger unwritten
2142 * extent. Note that we need not map all the described range since allocation
2143 * can return fewer blocks or the range is covered by more unwritten extents. We
2144 * cannot map more because we are limited by reserved transaction credits. On
2145 * the other hand we always make sure that the last touched page is fully
2146 * mapped so that it can be written out (and thus forward progress is
2147 * guaranteed). After mapping we submit all mapped pages for IO.
2273 */ 2148 */
2274static int write_cache_pages_da(handle_t *handle, 2149static int mpage_map_and_submit_extent(handle_t *handle,
2275 struct address_space *mapping, 2150 struct mpage_da_data *mpd,
2276 struct writeback_control *wbc, 2151 bool *give_up_on_write)
2277 struct mpage_da_data *mpd,
2278 pgoff_t *done_index)
2279{ 2152{
2280 struct buffer_head *bh, *head; 2153 struct inode *inode = mpd->inode;
2281 struct inode *inode = mapping->host; 2154 struct ext4_map_blocks *map = &mpd->map;
2282 struct pagevec pvec; 2155 int err;
2283 unsigned int nr_pages; 2156 loff_t disksize;
2284 sector_t logical;
2285 pgoff_t index, end;
2286 long nr_to_write = wbc->nr_to_write;
2287 int i, tag, ret = 0;
2288
2289 memset(mpd, 0, sizeof(struct mpage_da_data));
2290 mpd->wbc = wbc;
2291 mpd->inode = inode;
2292 pagevec_init(&pvec, 0);
2293 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2295 2157
2296 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2158 mpd->io_submit.io_end->offset =
2159 ((loff_t)map->m_lblk) << inode->i_blkbits;
2160 do {
2161 err = mpage_map_one_extent(handle, mpd);
2162 if (err < 0) {
2163 struct super_block *sb = inode->i_sb;
2164
2165 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2166 goto invalidate_dirty_pages;
2167 /*
2168 * Let the upper layers retry transient errors.
2169 * In the case of ENOSPC, if ext4_count_free_clusters()
2170 * is non-zero, a commit should free up blocks.
2171 */
2172 if ((err == -ENOMEM) ||
2173 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2174 return err;
2175 ext4_msg(sb, KERN_CRIT,
2176 "Delayed block allocation failed for "
2177 "inode %lu at logical offset %llu with"
2178 " max blocks %u with error %d",
2179 inode->i_ino,
2180 (unsigned long long)map->m_lblk,
2181 (unsigned)map->m_len, -err);
2182 ext4_msg(sb, KERN_CRIT,
2183 "This should not happen!! Data will "
2184 "be lost\n");
2185 if (err == -ENOSPC)
2186 ext4_print_free_blocks(inode);
2187 invalidate_dirty_pages:
2188 *give_up_on_write = true;
2189 return err;
2190 }
2191 /*
2192 * Update buffer state, submit mapped pages, and get us new
2193 * extent to map
2194 */
2195 err = mpage_map_and_submit_buffers(mpd);
2196 if (err < 0)
2197 return err;
2198 } while (map->m_len);
2199
2200 /* Update on-disk size after IO is submitted */
2201 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2202 if (disksize > i_size_read(inode))
2203 disksize = i_size_read(inode);
2204 if (disksize > EXT4_I(inode)->i_disksize) {
2205 int err2;
2206
2207 ext4_update_i_disksize(inode, disksize);
2208 err2 = ext4_mark_inode_dirty(handle, inode);
2209 if (err2)
2210 ext4_error(inode->i_sb,
2211 "Failed to mark inode %lu dirty",
2212 inode->i_ino);
2213 if (!err)
2214 err = err2;
2215 }
2216 return err;
2217}
2218
2219/*
2220 * Calculate the total number of credits to reserve for one writepages
2221 * iteration. This is called from ext4_writepages(). We map an extent of
2222 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2223 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2224 * bpp - 1 blocks in bpp different extents.
2225 */
2226static int ext4_da_writepages_trans_blocks(struct inode *inode)
2227{
2228 int bpp = ext4_journal_blocks_per_page(inode);
2229
2230 return ext4_meta_trans_blocks(inode,
2231 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2232}
2233
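A worked instance of that reservation: with 4 KiB blocks on 4 KiB pages, ext4_journal_blocks_per_page() yields bpp = 1 and one iteration covers 2048 + 1 - 1 = 2048 blocks in a single extent; with 1 KiB blocks, bpp = 4 and it covers 2051 blocks spread over at most 4 extents. In sketch form (the constant value is assumed from the #define above):

    /* Worst-case blocks mapped per ext4_writepages() iteration. */
    static int writepages_map_len(int bpp /* journal blocks per page */)
    {
        enum { MAX_EXTENT_LEN = 2048 };   /* MAX_WRITEPAGES_EXTENT_LEN */

        return MAX_EXTENT_LEN + bpp - 1;  /* full extent + last partial page */
    }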
2234/*
2235 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2236 * and underlying extent to map
2237 *
2238 * @mpd - where to look for pages
2239 *
2240 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2241 * IO immediately. When we find a page which isn't mapped we start accumulating
2242 * extent of buffers underlying these pages that needs mapping (formed by
2243 * either delayed or unwritten buffers). We also lock the pages containing
2244 * these buffers. The extent found is returned in @mpd structure (starting at
2245 * mpd->lblk with length mpd->len blocks).
2246 *
2247 * Note that this function can attach bios to one io_end structure which are
2248 * neither logically nor physically contiguous. Although it may seem like an
2249 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2250 * case as we need to track IO to all buffers underlying a page in one io_end.
2251 */
2252static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2253{
2254 struct address_space *mapping = mpd->inode->i_mapping;
2255 struct pagevec pvec;
2256 unsigned int nr_pages;
2257 pgoff_t index = mpd->first_page;
2258 pgoff_t end = mpd->last_page;
2259 int tag;
2260 int i, err = 0;
2261 int blkbits = mpd->inode->i_blkbits;
2262 ext4_lblk_t lblk;
2263 struct buffer_head *head;
2264
2265 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2297 tag = PAGECACHE_TAG_TOWRITE; 2266 tag = PAGECACHE_TAG_TOWRITE;
2298 else 2267 else
2299 tag = PAGECACHE_TAG_DIRTY; 2268 tag = PAGECACHE_TAG_DIRTY;
2300 2269
2301 *done_index = index; 2270 pagevec_init(&pvec, 0);
2271 mpd->map.m_len = 0;
2272 mpd->next_page = index;
2302 while (index <= end) { 2273 while (index <= end) {
2303 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2274 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2304 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2275 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2305 if (nr_pages == 0) 2276 if (nr_pages == 0)
2306 return 0; 2277 goto out;
2307 2278
2308 for (i = 0; i < nr_pages; i++) { 2279 for (i = 0; i < nr_pages; i++) {
2309 struct page *page = pvec.pages[i]; 2280 struct page *page = pvec.pages[i];
@@ -2318,31 +2289,21 @@ static int write_cache_pages_da(handle_t *handle,
2318 if (page->index > end) 2289 if (page->index > end)
2319 goto out; 2290 goto out;
2320 2291
2321 *done_index = page->index + 1; 2292 /* If we can't merge this page, we are done. */
2322 2293 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2323 /* 2294 goto out;
2324 * If we can't merge this page, and we have
2325 * accumulated a contiguous region, write it
2326 */
2327 if ((mpd->next_page != page->index) &&
2328 (mpd->next_page != mpd->first_page)) {
2329 mpage_da_map_and_submit(mpd);
2330 goto ret_extent_tail;
2331 }
2332 2295
2333 lock_page(page); 2296 lock_page(page);
2334
2335 /* 2297 /*
2336 * If the page is no longer dirty, or its 2298 * If the page is no longer dirty, or its mapping no
2337 * mapping no longer corresponds to inode we 2299 * longer corresponds to inode we are writing (which
2338 * are writing (which means it has been 2300 * means it has been truncated or invalidated), or the
2339 * truncated or invalidated), or the page is 2301 * page is already under writeback and we are not doing
2340 * already under writeback and we are not 2302 * a data integrity writeback, skip the page
2341 * doing a data integrity writeback, skip the page
2342 */ 2303 */
2343 if (!PageDirty(page) || 2304 if (!PageDirty(page) ||
2344 (PageWriteback(page) && 2305 (PageWriteback(page) &&
2345 (wbc->sync_mode == WB_SYNC_NONE)) || 2306 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2346 unlikely(page->mapping != mapping)) { 2307 unlikely(page->mapping != mapping)) {
2347 unlock_page(page); 2308 unlock_page(page);
2348 continue; 2309 continue;
@@ -2351,106 +2312,70 @@ static int write_cache_pages_da(handle_t *handle,
2351 wait_on_page_writeback(page); 2312 wait_on_page_writeback(page);
2352 BUG_ON(PageWriteback(page)); 2313 BUG_ON(PageWriteback(page));
2353 2314
2354 /* 2315 if (mpd->map.m_len == 0)
2355 * If we have inline data and arrive here, it means that
2356 * we will soon create the block for the 1st page, so
2357 * we'd better clear the inline data here.
2358 */
2359 if (ext4_has_inline_data(inode)) {
2360 BUG_ON(ext4_test_inode_state(inode,
2361 EXT4_STATE_MAY_INLINE_DATA));
2362 ext4_destroy_inline_data(handle, inode);
2363 }
2364
2365 if (mpd->next_page != page->index)
2366 mpd->first_page = page->index; 2316 mpd->first_page = page->index;
2367 mpd->next_page = page->index + 1; 2317 mpd->next_page = page->index + 1;
2368 logical = (sector_t) page->index <<
2369 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2370
2371 /* Add all dirty buffers to mpd */ 2318 /* Add all dirty buffers to mpd */
2319 lblk = ((ext4_lblk_t)page->index) <<
2320 (PAGE_CACHE_SHIFT - blkbits);
2372 head = page_buffers(page); 2321 head = page_buffers(page);
2373 bh = head; 2322 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2374 do { 2323 goto out;
2375 BUG_ON(buffer_locked(bh)); 2324 /* So far everything mapped? Submit the page for IO. */
2376 /* 2325 if (mpd->map.m_len == 0) {
2377 * We need to try to allocate unmapped blocks 2326 err = mpage_submit_page(mpd, page);
2378 * in the same page. Otherwise we won't make 2327 if (err < 0)
2379 * progress with the page in ext4_writepage
2380 */
2381 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2382 mpage_add_bh_to_extent(mpd, logical,
2383 bh->b_state);
2384 if (mpd->io_done)
2385 goto ret_extent_tail;
2386 } else if (buffer_dirty(bh) &&
2387 buffer_mapped(bh)) {
2388 /*
2389 * mapped dirty buffer. We need to
2390 * update the b_state because we look
2391 * at b_state in mpage_da_map_blocks.
2392 * We don't update b_size because if we
2393 * find an unmapped buffer_head later
2394 * we need to use the b_state flag of
2395 * that buffer_head.
2396 */
2397 if (mpd->b_size == 0)
2398 mpd->b_state =
2399 bh->b_state & BH_FLAGS;
2400 }
2401 logical++;
2402 } while ((bh = bh->b_this_page) != head);
2403
2404 if (nr_to_write > 0) {
2405 nr_to_write--;
2406 if (nr_to_write == 0 &&
2407 wbc->sync_mode == WB_SYNC_NONE)
2408 /*
2409 * We stop writing back only if we are
2410 * not doing integrity sync. In case of
2411 * integrity sync we have to keep going
2412 * because someone may be concurrently
2413 * dirtying pages, and we might have
2414 * synced a lot of newly appeared dirty
2415 * pages, but have not synced all of the
2416 * old dirty pages.
2417 */
2418 goto out; 2328 goto out;
2419 } 2329 }
2330
2331 /*
2332 * Accumulated enough dirty pages? This doesn't apply
2333 * to WB_SYNC_ALL mode. For integrity sync we have to
2334 * keep going because someone may be concurrently
2335 * dirtying pages, and we might have synced a lot of
2336 * newly appeared dirty pages, but have not synced all
2337 * of the old dirty pages.
2338 */
2339 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2340 mpd->next_page - mpd->first_page >=
2341 mpd->wbc->nr_to_write)
2342 goto out;
2420 } 2343 }
2421 pagevec_release(&pvec); 2344 pagevec_release(&pvec);
2422 cond_resched(); 2345 cond_resched();
2423 } 2346 }
2424 return 0; 2347 return 0;
2425ret_extent_tail:
2426 ret = MPAGE_DA_EXTENT_TAIL;
2427out: 2348out:
2428 pagevec_release(&pvec); 2349 pagevec_release(&pvec);
2429 cond_resched(); 2350 return err;
2430 return ret;
2431} 2351}
2432 2352
2353static int __writepage(struct page *page, struct writeback_control *wbc,
2354 void *data)
2355{
2356 struct address_space *mapping = data;
2357 int ret = ext4_writepage(page, wbc);
2358 mapping_set_error(mapping, ret);
2359 return ret;
2360}
2433 2361
2434static int ext4_da_writepages(struct address_space *mapping, 2362static int ext4_writepages(struct address_space *mapping,
2435 struct writeback_control *wbc) 2363 struct writeback_control *wbc)
2436{ 2364{
2437 pgoff_t index; 2365 pgoff_t writeback_index = 0;
2366 long nr_to_write = wbc->nr_to_write;
2438 int range_whole = 0; 2367 int range_whole = 0;
2368 int cycled = 1;
2439 handle_t *handle = NULL; 2369 handle_t *handle = NULL;
2440 struct mpage_da_data mpd; 2370 struct mpage_da_data mpd;
2441 struct inode *inode = mapping->host; 2371 struct inode *inode = mapping->host;
2442 int pages_written = 0; 2372 int needed_blocks, rsv_blocks = 0, ret = 0;
2443 unsigned int max_pages;
2444 int range_cyclic, cycled = 1, io_done = 0;
2445 int needed_blocks, ret = 0;
2446 long desired_nr_to_write, nr_to_writebump = 0;
2447 loff_t range_start = wbc->range_start;
2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2373 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2449 pgoff_t done_index = 0; 2374 bool done;
2450 pgoff_t end;
2451 struct blk_plug plug; 2375 struct blk_plug plug;
2376 bool give_up_on_write = false;
2452 2377
2453 trace_ext4_da_writepages(inode, wbc); 2378 trace_ext4_writepages(inode, wbc);
2454 2379
2455 /* 2380 /*
2456 * No pages to write? This is mainly a kludge to avoid starting 2381 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2385,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2385 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2461 return 0; 2386 return 0;
2462 2387
2388 if (ext4_should_journal_data(inode)) {
2389 struct blk_plug plug;
2390 int ret;
2391
2392 blk_start_plug(&plug);
2393 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2394 blk_finish_plug(&plug);
2395 return ret;
2396 }
2397
2463 /* 2398 /*
2464 * If the filesystem has aborted, it is read-only, so return 2399 * If the filesystem has aborted, it is read-only, so return
2465 * right away instead of dumping stack traces later on that 2400 * right away instead of dumping stack traces later on that
2466 * will obscure the real source of the problem. We test 2401 * will obscure the real source of the problem. We test
2467 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2402 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2468 * the latter could be true if the filesystem is mounted 2403 * the latter could be true if the filesystem is mounted
2469 * read-only, and in that case, ext4_da_writepages should 2404 * read-only, and in that case, ext4_writepages should
2470 * *never* be called, so if that ever happens, we would want 2405 * *never* be called, so if that ever happens, we would want
2471 * the stack trace. 2406 * the stack trace.
2472 */ 2407 */
2473 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2408 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2474 return -EROFS; 2409 return -EROFS;
2475 2410
2411 if (ext4_should_dioread_nolock(inode)) {
2412 /*
2413 * We may need to convert up to one extent per block in
2414 * the page and we may dirty the inode.
2415 */
2416 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2417 }
2418
2419 /*
2420 * If we have inline data and arrive here, it means that
2421 * we will soon create the block for the 1st page, so
2422 * we'd better clear the inline data here.
2423 */
2424 if (ext4_has_inline_data(inode)) {
2425 /* Just inode will be modified... */
2426 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2427 if (IS_ERR(handle)) {
2428 ret = PTR_ERR(handle);
2429 goto out_writepages;
2430 }
2431 BUG_ON(ext4_test_inode_state(inode,
2432 EXT4_STATE_MAY_INLINE_DATA));
2433 ext4_destroy_inline_data(handle, inode);
2434 ext4_journal_stop(handle);
2435 }
2436
2476 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2437 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2477 range_whole = 1; 2438 range_whole = 1;
2478 2439
2479 range_cyclic = wbc->range_cyclic;
2480 if (wbc->range_cyclic) { 2440 if (wbc->range_cyclic) {
2481 index = mapping->writeback_index; 2441 writeback_index = mapping->writeback_index;
2482 if (index) 2442 if (writeback_index)
2483 cycled = 0; 2443 cycled = 0;
2484 wbc->range_start = index << PAGE_CACHE_SHIFT; 2444 mpd.first_page = writeback_index;
2485 wbc->range_end = LLONG_MAX; 2445 mpd.last_page = -1;
2486 wbc->range_cyclic = 0;
2487 end = -1;
2488 } else { 2446 } else {
2489 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2447 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2490 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2448 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2491 }
2492
2493 /*
2494 * This works around two forms of stupidity. The first is in
2495 * the writeback code, which caps the maximum number of pages
2496 * written to be 1024 pages. This is wrong on multiple
2497 * levels; different architectures have a different page size,
2498 * which changes the maximum amount of data which gets
2499 * written. Secondly, 4 megabytes is way too small. XFS
2500 * forces this value to be 16 megabytes by multiplying
2501 * nr_to_write parameter by four, and then relies on its
2502 * allocator to allocate larger extents to make them
2503 * contiguous. Unfortunately this brings us to the second
2504 * stupidity, which is that ext4's mballoc code only allocates
2505 * at most 2048 blocks. So we force contiguous writes up to
2506 * the number of dirty blocks in the inode, or
2507 * sbi->max_writeback_mb_bump whichever is smaller.
2508 */
2509 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2510 if (!range_cyclic && range_whole) {
2511 if (wbc->nr_to_write == LONG_MAX)
2512 desired_nr_to_write = wbc->nr_to_write;
2513 else
2514 desired_nr_to_write = wbc->nr_to_write * 8;
2515 } else
2516 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2517 max_pages);
2518 if (desired_nr_to_write > max_pages)
2519 desired_nr_to_write = max_pages;
2520
2521 if (wbc->nr_to_write < desired_nr_to_write) {
2522 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2523 wbc->nr_to_write = desired_nr_to_write;
2524 } 2449 }
2525 2450
2451 mpd.inode = inode;
2452 mpd.wbc = wbc;
2453 ext4_io_submit_init(&mpd.io_submit, wbc);
2526retry: 2454retry:
2527 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2455 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2528 tag_pages_for_writeback(mapping, index, end); 2456 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2529 2457 done = false;
2530 blk_start_plug(&plug); 2458 blk_start_plug(&plug);
2531 while (!ret && wbc->nr_to_write > 0) { 2459 while (!done && mpd.first_page <= mpd.last_page) {
2460 /* For each extent of pages we use new io_end */
2461 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2462 if (!mpd.io_submit.io_end) {
2463 ret = -ENOMEM;
2464 break;
2465 }
2532 2466
2533 /* 2467 /*
2534 * we insert one extent at a time. So we need 2468 * We have two constraints: We find one extent to map and we
2535 * credits needed for a single extent allocation. 2469 * must always write out the whole page (makes a difference when
2536 * journalled mode is currently not supported 2470 * blocksize < pagesize) so that we don't block on IO when we
2537 * by delalloc 2471 * try to write out the rest of the page. Journalled mode is
2472 * not supported by delalloc.
2538 */ 2473 */
2539 BUG_ON(ext4_should_journal_data(inode)); 2474 BUG_ON(ext4_should_journal_data(inode));
2540 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2475 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2541 2476
2542 /* start a new transaction*/ 2477 /* start a new transaction */
2543 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2478 handle = ext4_journal_start_with_reserve(inode,
2544 needed_blocks); 2479 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2545 if (IS_ERR(handle)) { 2480 if (IS_ERR(handle)) {
2546 ret = PTR_ERR(handle); 2481 ret = PTR_ERR(handle);
2547 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2482 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2548 "%ld pages, ino %lu; err %d", __func__, 2483 "%ld pages, ino %lu; err %d", __func__,
2549 wbc->nr_to_write, inode->i_ino, ret); 2484 wbc->nr_to_write, inode->i_ino, ret);
2550 blk_finish_plug(&plug); 2485 /* Release allocated io_end */
2551 goto out_writepages; 2486 ext4_put_io_end(mpd.io_submit.io_end);
2487 break;
2552 } 2488 }
2553 2489
2554 /* 2490 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2555 * Now call write_cache_pages_da() to find the next 2491 ret = mpage_prepare_extent_to_map(&mpd);
2556 * contiguous region of logical blocks that need 2492 if (!ret) {
2557 * blocks to be allocated by ext4 and submit them. 2493 if (mpd.map.m_len)
2558 */ 2494 ret = mpage_map_and_submit_extent(handle, &mpd,
2559 ret = write_cache_pages_da(handle, mapping, 2495 &give_up_on_write);
2560 wbc, &mpd, &done_index); 2496 else {
2561 /* 2497 /*
2562 * If we have a contiguous extent of pages and we 2498 * We scanned the whole range (or exhausted
2563 * haven't done the I/O yet, map the blocks and submit 2499 * nr_to_write), submitted what was mapped and
2564 * them for I/O. 2500 * didn't find anything needing mapping. We are
2565 */ 2501 * done.
2566 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2502 */
2567 mpage_da_map_and_submit(&mpd); 2503 done = true;
2568 ret = MPAGE_DA_EXTENT_TAIL; 2504 }
2569 } 2505 }
2570 trace_ext4_da_write_pages(inode, &mpd);
2571 wbc->nr_to_write -= mpd.pages_written;
2572
2573 ext4_journal_stop(handle); 2506 ext4_journal_stop(handle);
2574 2507 /* Submit prepared bio */
2575 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2508 ext4_io_submit(&mpd.io_submit);
2576 /* commit the transaction which would 2509 /* Unlock pages we didn't use */
2510 mpage_release_unused_pages(&mpd, give_up_on_write);
2511 /* Drop our io_end reference we got from init */
2512 ext4_put_io_end(mpd.io_submit.io_end);
2513
2514 if (ret == -ENOSPC && sbi->s_journal) {
2515 /*
2516 * Commit the transaction which would
2577 * free blocks released in the transaction 2517 * free blocks released in the transaction
2578 * and try again 2518 * and try again
2579 */ 2519 */
2580 jbd2_journal_force_commit_nested(sbi->s_journal); 2520 jbd2_journal_force_commit_nested(sbi->s_journal);
2581 ret = 0; 2521 ret = 0;
2582 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2522 continue;
2583 /* 2523 }
2584 * Got one extent now try with rest of the pages. 2524 /* Fatal error - ENOMEM, EIO... */
2585 * If mpd.retval is set -EIO, journal is aborted. 2525 if (ret)
2586 * So we don't need to write any more.
2587 */
2588 pages_written += mpd.pages_written;
2589 ret = mpd.retval;
2590 io_done = 1;
2591 } else if (wbc->nr_to_write)
2592 /*
2593 * There is no more writeout needed
2594 * or we requested a nonblocking writeout
2595 * and we found the device congested
2596 */
2597 break; 2526 break;
2598 } 2527 }
2599 blk_finish_plug(&plug); 2528 blk_finish_plug(&plug);
2600 if (!io_done && !cycled) { 2529 if (!ret && !cycled) {
2601 cycled = 1; 2530 cycled = 1;
2602 index = 0; 2531 mpd.last_page = writeback_index - 1;
2603 wbc->range_start = index << PAGE_CACHE_SHIFT; 2532 mpd.first_page = 0;
2604 wbc->range_end = mapping->writeback_index - 1;
2605 goto retry; 2533 goto retry;
2606 } 2534 }
2607 2535
2608 /* Update index */ 2536 /* Update index */
2609 wbc->range_cyclic = range_cyclic;
2610 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2537 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2611 /* 2538 /*
2612 * set the writeback_index so that range_cyclic 2539 * Set the writeback_index so that range_cyclic
2613 * mode will write it back later 2540 * mode will write it back later
2614 */ 2541 */
2615 mapping->writeback_index = done_index; 2542 mapping->writeback_index = mpd.first_page;
2616 2543
2617out_writepages: 2544out_writepages:
2618 wbc->nr_to_write -= nr_to_writebump; 2545 trace_ext4_writepages_result(inode, wbc, ret,
2619 wbc->range_start = range_start; 2546 nr_to_write - wbc->nr_to_write);
2620 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2621 return ret; 2547 return ret;
2622} 2548}
2623 2549
@@ -2829,7 +2755,8 @@ static int ext4_da_write_end(struct file *file,
2829 return ret ? ret : copied; 2755 return ret ? ret : copied;
2830} 2756}
2831 2757
2832static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2758static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2759 unsigned int length)
2833{ 2760{
2834 /* 2761 /*
2835 * Drop reserved blocks 2762 * Drop reserved blocks
@@ -2838,10 +2765,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2838 if (!page_has_buffers(page)) 2765 if (!page_has_buffers(page))
2839 goto out; 2766 goto out;
2840 2767
2841 ext4_da_page_release_reservation(page, offset); 2768 ext4_da_page_release_reservation(page, offset, length);
2842 2769
2843out: 2770out:
2844 ext4_invalidatepage(page, offset); 2771 ext4_invalidatepage(page, offset, length);
2845 2772
2846 return; 2773 return;
2847} 2774}
@@ -2864,7 +2791,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2864 * laptop_mode, not even desirable). However, to do otherwise 2791 * laptop_mode, not even desirable). However, to do otherwise
2865 * would require replicating code paths in: 2792 * would require replicating code paths in:
2866 * 2793 *
2867 * ext4_da_writepages() -> 2794 * ext4_writepages() ->
2868 * write_cache_pages() ---> (via passed in callback function) 2795 * write_cache_pages() ---> (via passed in callback function)
2869 * __mpage_da_writepage() --> 2796 * __mpage_da_writepage() -->
2870 * mpage_add_bh_to_extent() 2797 * mpage_add_bh_to_extent()
@@ -2989,37 +2916,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2916 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2990} 2917}
2991 2918
2992static void ext4_invalidatepage(struct page *page, unsigned long offset) 2919static void ext4_invalidatepage(struct page *page, unsigned int offset,
2920 unsigned int length)
2993{ 2921{
2994 trace_ext4_invalidatepage(page, offset); 2922 trace_ext4_invalidatepage(page, offset, length);
2995 2923
2996 /* No journalling happens on data buffers when this function is used */ 2924 /* No journalling happens on data buffers when this function is used */
2997 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2925 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2998 2926
2999 block_invalidatepage(page, offset); 2927 block_invalidatepage(page, offset, length);
3000} 2928}
3001 2929
3002static int __ext4_journalled_invalidatepage(struct page *page, 2930static int __ext4_journalled_invalidatepage(struct page *page,
3003 unsigned long offset) 2931 unsigned int offset,
2932 unsigned int length)
3004{ 2933{
3005 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2934 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3006 2935
3007 trace_ext4_journalled_invalidatepage(page, offset); 2936 trace_ext4_journalled_invalidatepage(page, offset, length);
3008 2937
3009 /* 2938 /*
3010 * If it's a full truncate we just forget about the pending dirtying 2939 * If it's a full truncate we just forget about the pending dirtying
3011 */ 2940 */
3012 if (offset == 0) 2941 if (offset == 0 && length == PAGE_CACHE_SIZE)
3013 ClearPageChecked(page); 2942 ClearPageChecked(page);
3014 2943
3015 return jbd2_journal_invalidatepage(journal, page, offset); 2944 return jbd2_journal_invalidatepage(journal, page, offset, length);
3016} 2945}
3017 2946
3018/* Wrapper for aops... */ 2947/* Wrapper for aops... */
3019static void ext4_journalled_invalidatepage(struct page *page, 2948static void ext4_journalled_invalidatepage(struct page *page,
3020 unsigned long offset) 2949 unsigned int offset,
2950 unsigned int length)
3021{ 2951{
3022 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2952 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3023} 2953}
3024 2954
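The new offset/length pair lets the journalled path distinguish a whole-page invalidation, which may forget pending dirtying, from a partial one, which must not. A one-line sketch of that test (page size assumed 4 KiB for illustration):

    #define PG_SIZE 4096u

    /* Only a full-page invalidation may drop the page's checked state. */
    static int invalidates_whole_page(unsigned int offset, unsigned int length)
    {
        return offset == 0 && length == PG_SIZE;
    }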
3025static int ext4_releasepage(struct page *page, gfp_t wait) 2955static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +2997,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3067 struct inode *inode = file_inode(iocb->ki_filp); 2997 struct inode *inode = file_inode(iocb->ki_filp);
3068 ext4_io_end_t *io_end = iocb->private; 2998 ext4_io_end_t *io_end = iocb->private;
3069 2999
3070 /* if not async direct IO or dio with 0 bytes write, just return */ 3000 /* if not async direct IO just return */
3071 if (!io_end || !size) 3001 if (!io_end) {
3072 goto out; 3002 inode_dio_done(inode);
3003 if (is_async)
3004 aio_complete(iocb, ret, 0);
3005 return;
3006 }
3073 3007
3074 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3008 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3075 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3009 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3011,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3077 size); 3011 size);
3078 3012
3079 iocb->private = NULL; 3013 iocb->private = NULL;
3080
3081 /* if not aio dio with unwritten extents, just free io and return */
3082 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3083 ext4_free_io_end(io_end);
3084out:
3085 inode_dio_done(inode);
3086 if (is_async)
3087 aio_complete(iocb, ret, 0);
3088 return;
3089 }
3090
3091 io_end->offset = offset; 3014 io_end->offset = offset;
3092 io_end->size = size; 3015 io_end->size = size;
3093 if (is_async) { 3016 if (is_async) {
3094 io_end->iocb = iocb; 3017 io_end->iocb = iocb;
3095 io_end->result = ret; 3018 io_end->result = ret;
3096 } 3019 }
3097 3020 ext4_put_io_end_defer(io_end);
3098 ext4_add_complete_io(io_end);
3099} 3021}
3100 3022
3101/* 3023/*
@@ -3129,6 +3051,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3129 get_block_t *get_block_func = NULL; 3051 get_block_t *get_block_func = NULL;
3130 int dio_flags = 0; 3052 int dio_flags = 0;
3131 loff_t final_size = offset + count; 3053 loff_t final_size = offset + count;
3054 ext4_io_end_t *io_end = NULL;
3132 3055
3133 /* Use the old path for reads and writes beyond i_size. */ 3056 /* Use the old path for reads and writes beyond i_size. */
3134 if (rw != WRITE || final_size > inode->i_size) 3057 if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3059,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3136 3059
3137 BUG_ON(iocb->private == NULL); 3060 BUG_ON(iocb->private == NULL);
3138 3061
3062 /*
3063 * Make all waiters for direct IO properly wait also for extent
3064 * conversion. This also disallows race between truncate() and
3065 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3066 */
3067 if (rw == WRITE)
3068 atomic_inc(&inode->i_dio_count);
3069
3139 /* If we do a overwrite dio, i_mutex locking can be released */ 3070 /* If we do a overwrite dio, i_mutex locking can be released */
3140 overwrite = *((int *)iocb->private); 3071 overwrite = *((int *)iocb->private);
3141 3072
3142 if (overwrite) { 3073 if (overwrite) {
3143 atomic_inc(&inode->i_dio_count);
3144 down_read(&EXT4_I(inode)->i_data_sem); 3074 down_read(&EXT4_I(inode)->i_data_sem);
3145 mutex_unlock(&inode->i_mutex); 3075 mutex_unlock(&inode->i_mutex);
3146 } 3076 }
@@ -3167,13 +3097,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3167 iocb->private = NULL; 3097 iocb->private = NULL;
3168 ext4_inode_aio_set(inode, NULL); 3098 ext4_inode_aio_set(inode, NULL);
3169 if (!is_sync_kiocb(iocb)) { 3099 if (!is_sync_kiocb(iocb)) {
3170 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3100 io_end = ext4_init_io_end(inode, GFP_NOFS);
3171 if (!io_end) { 3101 if (!io_end) {
3172 ret = -ENOMEM; 3102 ret = -ENOMEM;
3173 goto retake_lock; 3103 goto retake_lock;
3174 } 3104 }
3175 io_end->flag |= EXT4_IO_END_DIRECT; 3105 io_end->flag |= EXT4_IO_END_DIRECT;
3176 iocb->private = io_end; 3106 /*
3107 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3108 */
3109 iocb->private = ext4_get_io_end(io_end);
3177 /* 3110 /*
3178 * we save the io structure for current async direct 3111 * we save the io structure for current async direct
3179 * IO, so that later ext4_map_blocks() could flag the 3112 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3130,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3197 NULL, 3130 NULL,
3198 dio_flags); 3131 dio_flags);
3199 3132
3200 if (iocb->private)
3201 ext4_inode_aio_set(inode, NULL);
3202 /* 3133 /*
3203 * The io_end structure takes a reference to the inode, that 3134 * Put our reference to io_end. This can free the io_end structure e.g.
3204 * structure needs to be destroyed and the reference to the 3135 * in sync IO case or in case of error. It can even perform extent
3205 * inode need to be dropped, when IO is complete, even with 0 3136 * conversion if all bios we submitted finished before we got here.
3206 * byte write, or failed. 3137 * Note that in that case iocb->private can be already set to NULL
3207 * 3138 * here.
3208 * In the successful AIO DIO case, the io_end structure will
3209 * be destroyed and the reference to the inode will be dropped
3210 * after the end_io call back function is called.
3211 *
3212 * In the case there is 0 byte write, or error case, since VFS
3213 * direct IO won't invoke the end_io call back function, we
3214 * need to free the end_io structure here.
3215 */ 3139 */
3216 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3140 if (io_end) {
3217 ext4_free_io_end(iocb->private); 3141 ext4_inode_aio_set(inode, NULL);
3218 iocb->private = NULL; 3142 ext4_put_io_end(io_end);
3219 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3143 /*
3144 * When no IO was submitted ext4_end_io_dio() was not
3145 * called so we have to put iocb's reference.
3146 */
3147 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3148 WARN_ON(iocb->private != io_end);
3149 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3150 WARN_ON(io_end->iocb);
3151 /*
3152 * Generic code already did inode_dio_done() so we
3153 * have to clear EXT4_IO_END_DIRECT to not do it for
3154 * the second time.
3155 */
3156 io_end->flag = 0;
3157 ext4_put_io_end(io_end);
3158 iocb->private = NULL;
3159 }
3160 }
3161 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3220 EXT4_STATE_DIO_UNWRITTEN)) { 3162 EXT4_STATE_DIO_UNWRITTEN)) {
3221 int err; 3163 int err;
3222 /* 3164 /*
3223 * for the non-AIO case, since the IO is already 3165 * for the non-AIO case, since the IO is already
3224 * completed, we could do the conversion right here 3166 * completed, we could do the conversion right here
3225 */ 3167 */
3226 err = ext4_convert_unwritten_extents(inode, 3168 err = ext4_convert_unwritten_extents(NULL, inode,
3227 offset, ret); 3169 offset, ret);
3228 if (err < 0) 3170 if (err < 0)
3229 ret = err; 3171 ret = err;
@@ -3231,9 +3173,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3231 } 3173 }
3232 3174
3233retake_lock: 3175retake_lock:
3176 if (rw == WRITE)
3177 inode_dio_done(inode);
3234 /* take i_mutex locking again if we do an overwrite dio */ 3178 /* take i_mutex locking again if we do an overwrite dio */
3235 if (overwrite) { 3179 if (overwrite) {
3236 inode_dio_done(inode);
3237 up_read(&EXT4_I(inode)->i_data_sem); 3180 up_read(&EXT4_I(inode)->i_data_sem);
3238 mutex_lock(&inode->i_mutex); 3181 mutex_lock(&inode->i_mutex);
3239 } 3182 }
@@ -3292,6 +3235,7 @@ static const struct address_space_operations ext4_aops = {
3292 .readpage = ext4_readpage, 3235 .readpage = ext4_readpage,
3293 .readpages = ext4_readpages, 3236 .readpages = ext4_readpages,
3294 .writepage = ext4_writepage, 3237 .writepage = ext4_writepage,
3238 .writepages = ext4_writepages,
3295 .write_begin = ext4_write_begin, 3239 .write_begin = ext4_write_begin,
3296 .write_end = ext4_write_end, 3240 .write_end = ext4_write_end,
3297 .bmap = ext4_bmap, 3241 .bmap = ext4_bmap,
@@ -3307,6 +3251,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3307 .readpage = ext4_readpage, 3251 .readpage = ext4_readpage,
3308 .readpages = ext4_readpages, 3252 .readpages = ext4_readpages,
3309 .writepage = ext4_writepage, 3253 .writepage = ext4_writepage,
3254 .writepages = ext4_writepages,
3310 .write_begin = ext4_write_begin, 3255 .write_begin = ext4_write_begin,
3311 .write_end = ext4_journalled_write_end, 3256 .write_end = ext4_journalled_write_end,
3312 .set_page_dirty = ext4_journalled_set_page_dirty, 3257 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3267,7 @@ static const struct address_space_operations ext4_da_aops = {
3322 .readpage = ext4_readpage, 3267 .readpage = ext4_readpage,
3323 .readpages = ext4_readpages, 3268 .readpages = ext4_readpages,
3324 .writepage = ext4_writepage, 3269 .writepage = ext4_writepage,
3325 .writepages = ext4_da_writepages, 3270 .writepages = ext4_writepages,
3326 .write_begin = ext4_da_write_begin, 3271 .write_begin = ext4_da_write_begin,
3327 .write_end = ext4_da_write_end, 3272 .write_end = ext4_da_write_end,
3328 .bmap = ext4_bmap, 3273 .bmap = ext4_bmap,
@@ -3355,89 +3300,56 @@ void ext4_set_aops(struct inode *inode)
3355 inode->i_mapping->a_ops = &ext4_aops; 3300 inode->i_mapping->a_ops = &ext4_aops;
3356} 3301}
3357 3302
3358
3359/* 3303/*
3360 * ext4_discard_partial_page_buffers() 3304 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3361 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3305 * up to the end of the block which corresponds to `from'.
3362 * This function finds and locks the page containing the offset 3306 * This required during truncate. We need to physically zero the tail end
3363 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3307 * of that block so it doesn't yield old data if the file is later grown.
3364 * Calling functions that already have the page locked should call
3365 * ext4_discard_partial_page_buffers_no_lock directly.
3366 */ 3308 */
3367int ext4_discard_partial_page_buffers(handle_t *handle, 3309int ext4_block_truncate_page(handle_t *handle,
3368 struct address_space *mapping, loff_t from, 3310 struct address_space *mapping, loff_t from)
3369 loff_t length, int flags)
3370{ 3311{
3312 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3313 unsigned length;
3314 unsigned blocksize;
3371 struct inode *inode = mapping->host; 3315 struct inode *inode = mapping->host;
3372 struct page *page;
3373 int err = 0;
3374 3316
3375 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3317 blocksize = inode->i_sb->s_blocksize;
3376 mapping_gfp_mask(mapping) & ~__GFP_FS); 3318 length = blocksize - (offset & (blocksize - 1));
3377 if (!page)
3378 return -ENOMEM;
3379
3380 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3381 from, length, flags);
3382 3319
3383 unlock_page(page); 3320 return ext4_block_zero_page_range(handle, mapping, from, length);
3384 page_cache_release(page);
3385 return err;
3386} 3321}
3387 3322
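ext4_block_truncate_page() now reduces to computing how many bytes remain in the block containing 'from' and delegating to the range helper. A sketch of the length math, assuming a power-of-two block size (names illustrative):

    /* Bytes from 'from' through the end of its block. */
    static unsigned tail_zero_len(unsigned long long from, unsigned blocksize)
    {
        unsigned offset = from & (blocksize - 1);  /* position within block */

        return blocksize - offset;                 /* zero up to block end */
    }

Truncating at byte 1,000,000 with 4 KiB blocks, for instance, gives offset 576 and zeroes the remaining 3,520 bytes of that block.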
3388/* 3323/*
3389 * ext4_discard_partial_page_buffers_no_lock() 3324 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3390 * Zeros a page range of length 'length' starting from offset 'from'. 3325 * starting from file offset 'from'. The range to be zeroed must
3391 * Buffer heads that correspond to the block aligned regions of the 3326 * be contained within one block. If the specified range exceeds
3392 * zeroed range will be unmapped. Non-block-aligned regions 3327 * the end of the block it will be shortened to the end of the block
3393 * will have the corresponding buffer head mapped if needed so that 3328 * that corresponds to 'from'
3394 * the region of the page can be updated with the partial zero out.
3395 *
3396 * This function assumes that the page has already been locked.
3397 * The range to be discarded must be contained within the given page.
3398 * If the specified range exceeds the end of the page it will be shortened
3399 * to the end of the page that corresponds to 'from'. This function is
3400 * appropriate for updating a page and its buffer heads to be unmapped and
3401 * zeroed for blocks that have been either released, or are going to be
3402 * released.
3403 *
3404 * handle: The journal handle
3405 * inode: The file's inode
3406 * page: A locked page that contains the offset "from"
3407 * from: The starting byte offset (from the beginning of the file)
3408 * to begin discarding
3409 * len: The length of bytes to discard
3410 * flags: Optional flags that may be used:
3411 *
3412 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3413 * Only zero the regions of the page whose buffer heads
3414 * have already been unmapped. This flag is appropriate
3415 * for updating the contents of a page whose blocks may
3416 * have already been released, and we only want to zero
3417 * out the regions that correspond to those released blocks.
3418 *
3419 * Returns zero on success or negative on failure.
3420 */ 3329 */
3421static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3330int ext4_block_zero_page_range(handle_t *handle,
3422 struct inode *inode, struct page *page, loff_t from, 3331 struct address_space *mapping, loff_t from, loff_t length)
3423 loff_t length, int flags)
3424{ 3332{
3425 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3333 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3426 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3334 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3427 unsigned int blocksize, max, pos; 3335 unsigned blocksize, max, pos;
3428 ext4_lblk_t iblock; 3336 ext4_lblk_t iblock;
3337 struct inode *inode = mapping->host;
3429 struct buffer_head *bh; 3338 struct buffer_head *bh;
3339 struct page *page;
3430 int err = 0; 3340 int err = 0;
3431 3341
3432 blocksize = inode->i_sb->s_blocksize; 3342 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3433 max = PAGE_CACHE_SIZE - offset; 3343 mapping_gfp_mask(mapping) & ~__GFP_FS);
3344 if (!page)
3345 return -ENOMEM;
3434 3346
3435 if (index != page->index) 3347 blocksize = inode->i_sb->s_blocksize;
3436 return -EINVAL; 3348 max = blocksize - (offset & (blocksize - 1));
3437 3349
3438 /* 3350 /*
3439 * correct length if it does not fall between 3351 * correct length if it does not fall between
3440 * 'from' and the end of the page 3352 * 'from' and the end of the block
3441 */ 3353 */
3442 if (length > max || length < 0) 3354 if (length > max || length < 0)
3443 length = max; 3355 length = max;
@@ -3455,106 +3367,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3455 iblock++; 3367 iblock++;
3456 pos += blocksize; 3368 pos += blocksize;
3457 } 3369 }
3458 3370 if (buffer_freed(bh)) {
3459 pos = offset; 3371 BUFFER_TRACE(bh, "freed: skip");
3460 while (pos < offset + length) { 3372 goto unlock;
3461 unsigned int end_of_block, range_to_discard; 3373 }
3462 3374 if (!buffer_mapped(bh)) {
3463 err = 0; 3375 BUFFER_TRACE(bh, "unmapped");
3464 3376 ext4_get_block(inode, iblock, bh, 0);
3465 /* The length of space left to zero and unmap */ 3377 /* unmapped? It's a hole - nothing to do */
3466 range_to_discard = offset + length - pos;
3467
3468 /* The length of space until the end of the block */
3469 end_of_block = blocksize - (pos & (blocksize-1));
3470
3471 /*
3472 * Do not unmap or zero past end of block
3473 * for this buffer head
3474 */
3475 if (range_to_discard > end_of_block)
3476 range_to_discard = end_of_block;
3477
3478
3479 /*
3480 * Skip this buffer head if we are only zeroing unmapped
3481 * regions of the page
3482 */
3483 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3484 buffer_mapped(bh))
3485 goto next;
3486
3487 /* If the range is block aligned, unmap */
3488 if (range_to_discard == blocksize) {
3489 clear_buffer_dirty(bh);
3490 bh->b_bdev = NULL;
3491 clear_buffer_mapped(bh);
3492 clear_buffer_req(bh);
3493 clear_buffer_new(bh);
3494 clear_buffer_delay(bh);
3495 clear_buffer_unwritten(bh);
3496 clear_buffer_uptodate(bh);
3497 zero_user(page, pos, range_to_discard);
3498 BUFFER_TRACE(bh, "Buffer discarded");
3499 goto next;
3500 }
3501
3502 /*
3503 * If this block is not completely contained in the range
3504 * to be discarded, then it is not going to be released. Because
3505 * we need to keep this block, we need to make sure this part
3506 * of the page is uptodate before we modify it by writing
3507 * partial zeros on it.
3508 */
3509 if (!buffer_mapped(bh)) { 3378 if (!buffer_mapped(bh)) {
3510 /* 3379 BUFFER_TRACE(bh, "still unmapped");
3511 * Buffer head must be mapped before we can read 3380 goto unlock;
3512 * from the block
3513 */
3514 BUFFER_TRACE(bh, "unmapped");
3515 ext4_get_block(inode, iblock, bh, 0);
3516 /* unmapped? It's a hole - nothing to do */
3517 if (!buffer_mapped(bh)) {
3518 BUFFER_TRACE(bh, "still unmapped");
3519 goto next;
3520 }
3521 } 3381 }
3382 }
3522 3383
3523 /* Ok, it's mapped. Make sure it's up-to-date */ 3384 /* Ok, it's mapped. Make sure it's up-to-date */
3524 if (PageUptodate(page)) 3385 if (PageUptodate(page))
3525 set_buffer_uptodate(bh); 3386 set_buffer_uptodate(bh);
3526 3387
3527 if (!buffer_uptodate(bh)) { 3388 if (!buffer_uptodate(bh)) {
3528 err = -EIO; 3389 err = -EIO;
3529 ll_rw_block(READ, 1, &bh); 3390 ll_rw_block(READ, 1, &bh);
3530 wait_on_buffer(bh); 3391 wait_on_buffer(bh);
3531 /* Uhhuh. Read error. Complain and punt.*/ 3392 /* Uhhuh. Read error. Complain and punt. */
3532 if (!buffer_uptodate(bh)) 3393 if (!buffer_uptodate(bh))
3533 goto next; 3394 goto unlock;
3534 } 3395 }
3396 if (ext4_should_journal_data(inode)) {
3397 BUFFER_TRACE(bh, "get write access");
3398 err = ext4_journal_get_write_access(handle, bh);
3399 if (err)
3400 goto unlock;
3401 }
3402 zero_user(page, offset, length);
3403 BUFFER_TRACE(bh, "zeroed end of block");
3535 3404
3536 if (ext4_should_journal_data(inode)) { 3405 if (ext4_should_journal_data(inode)) {
3537 BUFFER_TRACE(bh, "get write access"); 3406 err = ext4_handle_dirty_metadata(handle, inode, bh);
3538 err = ext4_journal_get_write_access(handle, bh); 3407 } else {
3539 if (err) 3408 err = 0;
3540 goto next; 3409 mark_buffer_dirty(bh);
3541 } 3410 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3411 err = ext4_jbd2_file_inode(handle, inode);
3412 }
3542 3413
3543 zero_user(page, pos, range_to_discard); 3414unlock:
3415 unlock_page(page);
3416 page_cache_release(page);
3417 return err;
3418}
3544 3419
3545 err = 0; 3420int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3546 if (ext4_should_journal_data(inode)) { 3421 loff_t lstart, loff_t length)
3547 err = ext4_handle_dirty_metadata(handle, inode, bh); 3422{
3548 } else 3423 struct super_block *sb = inode->i_sb;
3549 mark_buffer_dirty(bh); 3424 struct address_space *mapping = inode->i_mapping;
3425 unsigned partial_start, partial_end;
3426 ext4_fsblk_t start, end;
3427 loff_t byte_end = (lstart + length - 1);
3428 int err = 0;
3550 3429
3551 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3430 partial_start = lstart & (sb->s_blocksize - 1);
3552next: 3431 partial_end = byte_end & (sb->s_blocksize - 1);
3553 bh = bh->b_this_page;
3554 iblock++;
3555 pos += range_to_discard;
3556 }
3557 3432
3433 start = lstart >> sb->s_blocksize_bits;
3434 end = byte_end >> sb->s_blocksize_bits;
3435
3436 /* Handle partial zero within the single block */
3437 if (start == end &&
3438 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3439 err = ext4_block_zero_page_range(handle, mapping,
3440 lstart, length);
3441 return err;
3442 }
3443 /* Handle partial zero out on the start of the range */
3444 if (partial_start) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, sb->s_blocksize);
3447 if (err)
3448 return err;
3449 }
3450 /* Handle partial zero out on the end of the range */
3451 if (partial_end != sb->s_blocksize - 1)
3452 err = ext4_block_zero_page_range(handle, mapping,
3453 byte_end - partial_end,
3454 partial_end + 1);
3558 return err; 3455 return err;
3559} 3456}
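
The helper above reduces an arbitrary byte range to at most two calls into ext4_block_zero_page_range(): one for the unaligned head and one for the unaligned tail, with the fully-inside-one-block case handled first. A stand-alone sketch of that split (assuming a power-of-two block size; split_partial_blocks() is an illustrative name, not ext4 API — note the head call in the patch passes a full blocksize as length and relies on the callee clamping it to the end of the block):

#include <stdio.h>

/* Illustrative re-statement of ext4_zero_partial_blocks()'s range split. */
static void split_partial_blocks(long long lstart, long long length,
				 unsigned int blocksize)
{
	long long byte_end = lstart + length - 1;
	unsigned int partial_start = lstart & (blocksize - 1);
	unsigned int partial_end = byte_end & (blocksize - 1);
	long long start = lstart / blocksize;	/* >> s_blocksize_bits */
	long long end = byte_end / blocksize;

	if (start == end &&
	    (partial_start || partial_end != blocksize - 1)) {
		/* range confined to one block: a single partial zero-out */
		printf("zero %lld..%lld\n", lstart, byte_end);
		return;
	}
	if (partial_start)	/* unaligned head, clamped by the callee */
		printf("zero %lld..%lld\n", lstart,
		       lstart + (blocksize - partial_start) - 1);
	if (partial_end != blocksize - 1)	/* unaligned tail */
		printf("zero %lld..%lld\n", byte_end - partial_end, byte_end);
}

int main(void)
{
	split_partial_blocks(5000, 10000, 4096);	/* head and tail */
	return 0;
}
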
3560 3457
@@ -3580,14 +3477,12 @@ int ext4_can_truncate(struct inode *inode)
3580 * Returns: 0 on success or negative on failure 3477 * Returns: 0 on success or negative on failure
3581 */ 3478 */
3582 3479
3583int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3480int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3584{ 3481{
3585 struct inode *inode = file_inode(file);
3586 struct super_block *sb = inode->i_sb; 3482 struct super_block *sb = inode->i_sb;
3587 ext4_lblk_t first_block, stop_block; 3483 ext4_lblk_t first_block, stop_block;
3588 struct address_space *mapping = inode->i_mapping; 3484 struct address_space *mapping = inode->i_mapping;
3589 loff_t first_page, last_page, page_len; 3485 loff_t first_block_offset, last_block_offset;
3590 loff_t first_page_offset, last_page_offset;
3591 handle_t *handle; 3486 handle_t *handle;
3592 unsigned int credits; 3487 unsigned int credits;
3593 int ret = 0; 3488 int ret = 0;
@@ -3638,23 +3533,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3638 offset; 3533 offset;
3639 } 3534 }
3640 3535
3641 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3536 first_block_offset = round_up(offset, sb->s_blocksize);
3642 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3537 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3643 3538
3644 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3539 /* Now release the pages and zero the block-aligned part of pages */
3645 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3540 if (last_block_offset > first_block_offset)
3646 3541 truncate_pagecache_range(inode, first_block_offset,
3647 /* Now release the pages */ 3542 last_block_offset);
3648 if (last_page_offset > first_page_offset) {
3649 truncate_pagecache_range(inode, first_page_offset,
3650 last_page_offset - 1);
3651 }
3652 3543
3653 /* Wait all existing dio workers, newcomers will block on i_mutex */ 3544 /* Wait all existing dio workers, newcomers will block on i_mutex */
3654 ext4_inode_block_unlocked_dio(inode); 3545 ext4_inode_block_unlocked_dio(inode);
3655 ret = ext4_flush_unwritten_io(inode);
3656 if (ret)
3657 goto out_dio;
3658 inode_dio_wait(inode); 3546 inode_dio_wait(inode);
3659 3547
3660 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3548 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3556,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3668 goto out_dio; 3556 goto out_dio;
3669 } 3557 }
3670 3558
3671 /* 3559 ret = ext4_zero_partial_blocks(handle, inode, offset,
3672 * Now we need to zero out the non-page-aligned data in the 3560 length);
3673 * pages at the start and tail of the hole, and unmap the 3561 if (ret)
3674 * buffer heads for the block aligned regions of the page that 3562 goto out_stop;
3675 * were completely zeroed.
3676 */
3677 if (first_page > last_page) {
3678 /*
3679 * If the file space being truncated is contained
3680 * within a page just zero out and unmap the middle of
3681 * that page
3682 */
3683 ret = ext4_discard_partial_page_buffers(handle,
3684 mapping, offset, length, 0);
3685
3686 if (ret)
3687 goto out_stop;
3688 } else {
3689 /*
3690 * zero out and unmap the partial page that contains
3691 * the start of the hole
3692 */
3693 page_len = first_page_offset - offset;
3694 if (page_len > 0) {
3695 ret = ext4_discard_partial_page_buffers(handle, mapping,
3696 offset, page_len, 0);
3697 if (ret)
3698 goto out_stop;
3699 }
3700
3701 /*
3702 * zero out and unmap the partial page that contains
3703 * the end of the hole
3704 */
3705 page_len = offset + length - last_page_offset;
3706 if (page_len > 0) {
3707 ret = ext4_discard_partial_page_buffers(handle, mapping,
3708 last_page_offset, page_len, 0);
3709 if (ret)
3710 goto out_stop;
3711 }
3712 }
3713
3714 /*
3715 * If i_size is contained in the last page, we need to
3716 * unmap and zero the partial page after i_size
3717 */
3718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3719 inode->i_size % PAGE_CACHE_SIZE != 0) {
3720 page_len = PAGE_CACHE_SIZE -
3721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3722
3723 if (page_len > 0) {
3724 ret = ext4_discard_partial_page_buffers(handle,
3725 mapping, inode->i_size, page_len, 0);
3726
3727 if (ret)
3728 goto out_stop;
3729 }
3730 }
3731 3563
3732 first_block = (offset + sb->s_blocksize - 1) >> 3564 first_block = (offset + sb->s_blocksize - 1) >>
3733 EXT4_BLOCK_SIZE_BITS(sb); 3565 EXT4_BLOCK_SIZE_BITS(sb);
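
The page-granular trimming above is replaced by block-granular bounds: only the fully block-aligned middle of the hole is dropped from the page cache, and the unaligned edges are zeroed through ext4_zero_partial_blocks() instead. A sketch of the two alignment helpers as used here (power-of-two alignment assumed, which is what the kernel's round_up()/round_down() reduce to; the my_ prefixes mark them as illustrative):

static inline unsigned long long my_round_up(unsigned long long x,
					     unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);	/* next multiple of align */
}

static inline unsigned long long my_round_down(unsigned long long x,
					       unsigned long long align)
{
	return x & ~(align - 1);		/* previous multiple of align */
}

/* offset = 5000, length = 10000, blocksize = 4096:
 *   first_block_offset = my_round_up(5000, 4096)        = 8192
 *   last_block_offset  = my_round_down(15000, 4096) - 1 = 12287
 * so only bytes 8192..12287 are truncated from the page cache, while
 * 5000..8191 and 12288..14999 are zeroed in place, block by block. */
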
@@ -3803,7 +3635,6 @@ void ext4_truncate(struct inode *inode)
3803 unsigned int credits; 3635 unsigned int credits;
3804 handle_t *handle; 3636 handle_t *handle;
3805 struct address_space *mapping = inode->i_mapping; 3637 struct address_space *mapping = inode->i_mapping;
3806 loff_t page_len;
3807 3638
3808 /* 3639 /*
3809 * There is a possibility that we're either freeing the inode 3640 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3661,6 @@ void ext4_truncate(struct inode *inode)
3830 return; 3661 return;
3831 } 3662 }
3832 3663
3833 /*
3834 * finish any pending end_io work so we won't run the risk of
3835 * converting any truncated blocks to initialized later
3836 */
3837 ext4_flush_unwritten_io(inode);
3838
3839 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3664 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3840 credits = ext4_writepage_trans_blocks(inode); 3665 credits = ext4_writepage_trans_blocks(inode);
3841 else 3666 else
@@ -3847,14 +3672,8 @@ void ext4_truncate(struct inode *inode)
3847 return; 3672 return;
3848 } 3673 }
3849 3674
3850 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3675 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3851 page_len = PAGE_CACHE_SIZE - 3676 ext4_block_truncate_page(handle, mapping, inode->i_size);
3852 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3853
3854 if (ext4_discard_partial_page_buffers(handle,
3855 mapping, inode->i_size, page_len, 0))
3856 goto out_stop;
3857 }
3858 3677
3859 /* 3678 /*
3860 * We add the inode to the orphan list, so that if this 3679 * We add the inode to the orphan list, so that if this
@@ -4623,7 +4442,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4623 inode->i_size >> PAGE_CACHE_SHIFT); 4442 inode->i_size >> PAGE_CACHE_SHIFT);
4624 if (!page) 4443 if (!page)
4625 return; 4444 return;
4626 ret = __ext4_journalled_invalidatepage(page, offset); 4445 ret = __ext4_journalled_invalidatepage(page, offset,
4446 PAGE_CACHE_SIZE - offset);
4627 unlock_page(page); 4447 unlock_page(page);
4628 page_cache_release(page); 4448 page_cache_release(page);
4629 if (ret != -EBUSY) 4449 if (ret != -EBUSY)
@@ -4805,7 +4625,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4805 struct kstat *stat) 4625 struct kstat *stat)
4806{ 4626{
4807 struct inode *inode; 4627 struct inode *inode;
4808 unsigned long delalloc_blocks; 4628 unsigned long long delalloc_blocks;
4809 4629
4810 inode = dentry->d_inode; 4630 inode = dentry->d_inode;
4811 generic_fillattr(inode, stat); 4631 generic_fillattr(inode, stat);
@@ -4823,15 +4643,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4823 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4643 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4824 EXT4_I(inode)->i_reserved_data_blocks); 4644 EXT4_I(inode)->i_reserved_data_blocks);
4825 4645
4826 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4646 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4827 return 0; 4647 return 0;
4828} 4648}
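
Both halves of this hunk attack the same 32-bit truncation: delalloc_blocks is widened to unsigned long long, and the two-step shift (left by blocksize_bits, then right by 9) becomes a single left shift by (blocksize_bits - 9), so no intermediate value ever exceeds the final one. A stand-alone demonstration with assumed example values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t delalloc_blocks32 = 2000000;	/* ~7.6 GiB of dirty 4K blocks */
	uint64_t delalloc_blocks64 = delalloc_blocks32;
	unsigned blocksize_bits = 12;		/* 4096-byte blocks */

	/* old way: the intermediate left shift wraps in 32 bits */
	uint32_t old_way = (uint32_t)(delalloc_blocks32 << blocksize_bits) >> 9;
	/* new way: one shift on a 64-bit value, no intermediate overflow */
	uint64_t new_way = delalloc_blocks64 << (blocksize_bits - 9);

	assert(new_way == 16000000ULL);	/* correct count of 512-byte sectors */
	assert(old_way != new_way);	/* the 32-bit path silently lost bits */
	return 0;
}
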
4829 4649
4830static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4650static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4651 int pextents)
4831{ 4652{
4832 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4653 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4654 return ext4_ind_trans_blocks(inode, lblocks);
4834 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4655 return ext4_ext_index_trans_blocks(inode, pextents);
4835} 4656}
4836 4657
4837/* 4658/*
@@ -4845,7 +4666,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4845 * 4666 *
4846 * Also account for superblock, inode, quota and xattr blocks 4667 * Also account for superblock, inode, quota and xattr blocks
4847 */ 4668 */
4848static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4669static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4670 int pextents)
4849{ 4671{
4850 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4672 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4851 int gdpblocks; 4673 int gdpblocks;
@@ -4853,14 +4675,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4853 int ret = 0; 4675 int ret = 0;
4854 4676
4855 /* 4677 /*
4856 * How many index blocks need to touch to modify nrblocks? 4678 * How many index blocks do we need to touch to map @lblocks logical blocks
4857 * The "Chunk" flag indicating whether the nrblocks is 4679 * to @pextents physical extents?
4858 * physically contiguous on disk
4859 *
4860 * For Direct IO and fallocate, they calls get_block to allocate
4861 * one single extent at a time, so they could set the "Chunk" flag
4862 */ 4680 */
4863 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4681 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4864 4682
4865 ret = idxblocks; 4683 ret = idxblocks;
4866 4684
@@ -4868,12 +4686,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4868 * Now let's see how many group bitmaps and group descriptors need 4686 * Now let's see how many group bitmaps and group descriptors need
4869 * to account 4687 * to account
4870 */ 4688 */
4871 groups = idxblocks; 4689 groups = idxblocks + pextents;
4872 if (chunk)
4873 groups += 1;
4874 else
4875 groups += nrblocks;
4876
4877 gdpblocks = groups; 4690 gdpblocks = groups;
4878 if (groups > ngroups) 4691 if (groups > ngroups)
4879 groups = ngroups; 4692 groups = ngroups;
@@ -4904,7 +4717,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4904 int bpp = ext4_journal_blocks_per_page(inode); 4717 int bpp = ext4_journal_blocks_per_page(inode);
4905 int ret; 4718 int ret;
4906 4719
4907 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4720 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4908 4721
4909 /* Account for data blocks for journalled mode */ 4722 /* Account for data blocks for journalled mode */
4910 if (ext4_should_journal_data(inode)) 4723 if (ext4_should_journal_data(inode))
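
With the chunk flag gone, the transaction-credit estimate charges every physical extent as potentially touching its own block group. A compressed sketch of the accounting after this change (illustrative helper, not ext4 API):

static int meta_groups_touched(int idxblocks, int pextents, int ngroups)
{
	/* was: groups = idxblocks + (chunk ? 1 : nrblocks) */
	int groups = idxblocks + pextents;

	return groups > ngroups ? ngroups : groups;	/* capped at group count */
}

ext4_writepage_trans_blocks() accordingly now passes (bpp, bpp): a page of bpp blocks may, in the worst case, map to bpp single-block extents.
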
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..c0427e2f6648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -77,8 +77,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); 77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); 78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
80 memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); 80 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
81 memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); 81 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
82 ext4_es_lru_del(inode1);
83 ext4_es_lru_del(inode2);
82 84
83 isize = i_size_read(inode1); 85 isize = i_size_read(inode1);
84 i_size_write(inode1, i_size_read(inode2)); 86 i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 cond_resched();
2108 /* 2109 /*
2109 * Artificially restricted ngroups for non-extent 2110 * Artificially restricted ngroups for non-extent
2110 * files makes group > ngroups possible on first loop. 2111 * files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4405repeat: 4406repeat:
4406 /* allocate space in core */ 4407 /* allocate space in core */
4407 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4408 if (*errp) { 4409 if (*errp)
4409 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4410 goto errout;
4411 }
4412 4411
4413 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4414 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4415 * space in a special descriptor */ 4414 * space in a special descriptor */
4416 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4417 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4418 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4419 } 4423 }
4420 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4421 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4612 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4613 4617
4614 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4615 if (!bh) 4620 if (!bh)
4616 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4617 block + i); 4622 block + i);
4618 if (unlikely(!tbh)) 4623 if (!tbh)
4619 continue; 4624 continue;
4620 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4621 inode, tbh, block + i); 4626 inode, tbh, block + i);
@@ -4735,11 +4740,16 @@ do_more:
4735 * blocks being freed are metadata. these blocks shouldn't 4740 * blocks being freed are metadata. these blocks shouldn't
4736 * be used until this transaction is committed 4741 * be used until this transaction is committed
4737 */ 4742 */
4743 retry:
4738 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4744 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4739 if (!new_entry) { 4745 if (!new_entry) {
4740 ext4_mb_unload_buddy(&e4b); 4746 /*
4741 err = -ENOMEM; 4747 * We use a retry loop because
4742 goto error_return; 4748 * ext4_free_blocks() is not allowed to fail.
4749 */
4750 cond_resched();
4751 congestion_wait(BLK_RW_ASYNC, HZ/50);
4752 goto retry;
4743 } 4753 }
4744 new_entry->efd_start_cluster = bit; 4754 new_entry->efd_start_cluster = bit;
4745 new_entry->efd_group = block_group; 4755 new_entry->efd_group = block_group;
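
The ENOMEM path in ext4_free_blocks() is gone: freeing must not fail, so the allocation now spins with scheduling and writeback back-offs until the slab yields an object. The generic shape of that pattern, pulled out as a sketch (kernel context assumed; alloc_nofail() is an illustrative name):

#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>

static void *alloc_nofail(struct kmem_cache *cachep)
{
	void *p;

	while (!(p = kmem_cache_alloc(cachep, GFP_NOFS))) {
		cond_resched();				/* yield the CPU */
		congestion_wait(BLK_RW_ASYNC, HZ/50);	/* let writeback drain */
	}
	return p;
}

Passing __GFP_NOFAIL would express the same "may not fail" requirement to the allocator; the open-coded loop in the patch keeps the congestion back-off explicit.
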
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..35f55a0dbc4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
@@ -2299,6 +2296,45 @@ retry:
2299 return err; 2296 return err;
2300} 2297}
2301 2298
2299static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2300{
2301 handle_t *handle;
2302 struct inode *inode;
2303 int err, retries = 0;
2304
2305 dquot_initialize(dir);
2306
2307retry:
2308 inode = ext4_new_inode_start_handle(dir, mode,
2309 NULL, 0, NULL,
2310 EXT4_HT_DIR,
2311 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2312 4 + EXT4_XATTR_TRANS_BLOCKS);
2313 handle = ext4_journal_current_handle();
2314 err = PTR_ERR(inode);
2315 if (!IS_ERR(inode)) {
2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode);
2319 d_tmpfile(dentry, inode);
2320 err = ext4_orphan_add(handle, inode);
2321 if (err)
2322 goto err_drop_inode;
2323 mark_inode_dirty(inode);
2324 unlock_new_inode(inode);
2325 }
2326 if (handle)
2327 ext4_journal_stop(handle);
2328 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2329 goto retry;
2330 return err;
2331err_drop_inode:
2332 ext4_journal_stop(handle);
2333 unlock_new_inode(inode);
2334 iput(inode);
2335 return err;
2336}
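
ext4_tmpfile() above creates the inode already on the orphan list, so an unlinked temporary file is reclaimed automatically after a crash; only a later link (see the ext4_link() hunk below) takes it off the list. From user space this is the O_TMPFILE pattern — a sketch, assuming a kernel and libc new enough to expose O_TMPFILE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	/* unnamed file on the fs backing /tmp; already on the orphan list */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	if (write(fd, "scratch\n", 8) != 8)
		perror("write");

	/* atomically give it a name; ext4_link() drops the orphan entry */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");
	close(fd);
	return 0;
}
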
2337
2302struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2338struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2303 struct ext4_dir_entry_2 *de, 2339 struct ext4_dir_entry_2 *de,
2304 int blocksize, int csum_size, 2340 int blocksize, int csum_size,
@@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
2906retry: 2942retry:
2907 handle = ext4_journal_start(dir, EXT4_HT_DIR, 2943 handle = ext4_journal_start(dir, EXT4_HT_DIR,
2908 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2944 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2909 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); 2945 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
2910 if (IS_ERR(handle)) 2946 if (IS_ERR(handle))
2911 return PTR_ERR(handle); 2947 return PTR_ERR(handle);
2912 2948
@@ -2920,6 +2956,11 @@ retry:
2920 err = ext4_add_entry(handle, dentry, inode); 2956 err = ext4_add_entry(handle, dentry, inode);
2921 if (!err) { 2957 if (!err) {
2922 ext4_mark_inode_dirty(handle, inode); 2958 ext4_mark_inode_dirty(handle, inode);
2959 /* this can happen only for a tmpfile being
2960 * linked for the first time
2961 */
2962 if (inode->i_nlink == 1)
2963 ext4_orphan_del(handle, inode);
2923 d_instantiate(dentry, inode); 2964 d_instantiate(dentry, inode);
2924 } else { 2965 } else {
2925 drop_nlink(inode); 2966 drop_nlink(inode);
@@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3172 .mkdir = ext4_mkdir, 3213 .mkdir = ext4_mkdir,
3173 .rmdir = ext4_rmdir, 3214 .rmdir = ext4_rmdir,
3174 .mknod = ext4_mknod, 3215 .mknod = ext4_mknod,
3216 .tmpfile = ext4_tmpfile,
3175 .rename = ext4_rename, 3217 .rename = ext4_rename,
3176 .setattr = ext4_setattr, 3218 .setattr = ext4_setattr,
3177 .setxattr = generic_setxattr, 3219 .setxattr = generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/ratelimit.h>
28 29
29#include "ext4_jbd2.h" 30#include "ext4_jbd2.h"
30#include "xattr.h" 31#include "xattr.h"
@@ -46,46 +47,121 @@ void ext4_exit_pageio(void)
46} 47}
47 48
48/* 49/*
49 * This function is called by ext4_evict_inode() to make sure there is 50 * Print a buffer I/O error compatible with fs/buffer.c. This
50 * no more pending I/O completion work left to do. 51 * provides compatibility with dmesg scrapers that look for a specific
52 * buffer I/O error message. We really need a unified error reporting
53 * structure to userspace ala Digital Unix's uerf system, but it's
54 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 55 */
52void ext4_ioend_shutdown(struct inode *inode) 56static void buffer_io_error(struct buffer_head *bh)
57{
58 char b[BDEVNAME_SIZE];
59 printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
60 bdevname(bh->b_bdev, b),
61 (unsigned long long)bh->b_blocknr);
62}
63
64static void ext4_finish_bio(struct bio *bio)
53{ 65{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 66 int i;
67 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
55 68
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 69 for (i = 0; i < bio->bi_vcnt; i++) {
57 /* 70 struct bio_vec *bvec = &bio->bi_io_vec[i];
58 * We need to make sure the work structure is finished being 71 struct page *page = bvec->bv_page;
59 * used before we let the inode get destroyed. 72 struct buffer_head *bh, *head;
60 */ 73 unsigned bio_start = bvec->bv_offset;
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 74 unsigned bio_end = bio_start + bvec->bv_len;
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 75 unsigned under_io = 0;
76 unsigned long flags;
77
78 if (!page)
79 continue;
80
81 if (error) {
82 SetPageError(page);
83 set_bit(AS_EIO, &page->mapping->flags);
84 }
85 bh = head = page_buffers(page);
86 /*
87 * We check all buffers in the page under BH_Uptodate_Lock
88 * to avoid races with other end io clearing async_write flags
89 */
90 local_irq_save(flags);
91 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
92 do {
93 if (bh_offset(bh) < bio_start ||
94 bh_offset(bh) + bh->b_size > bio_end) {
95 if (buffer_async_write(bh))
96 under_io++;
97 continue;
98 }
99 clear_buffer_async_write(bh);
100 if (error)
101 buffer_io_error(bh);
102 } while ((bh = bh->b_this_page) != head);
103 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
104 local_irq_restore(flags);
105 if (!under_io)
106 end_page_writeback(page);
107 }
108}
109
110static void ext4_release_io_end(ext4_io_end_t *io_end)
111{
112 struct bio *bio, *next_bio;
113
114 BUG_ON(!list_empty(&io_end->list));
115 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
116 WARN_ON(io_end->handle);
117
118 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
119 wake_up_all(ext4_ioend_wq(io_end->inode));
120
121 for (bio = io_end->bio; bio; bio = next_bio) {
122 next_bio = bio->bi_private;
123 ext4_finish_bio(bio);
124 bio_put(bio);
125 }
126 if (io_end->flag & EXT4_IO_END_DIRECT)
127 inode_dio_done(io_end->inode);
128 if (io_end->iocb)
129 aio_complete(io_end->iocb, io_end->result, 0);
130 kmem_cache_free(io_end_cachep, io_end);
63} 131}
64 132
65void ext4_free_io_end(ext4_io_end_t *io) 133static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
66{ 134{
67 BUG_ON(!io); 135 struct inode *inode = io_end->inode;
68 BUG_ON(!list_empty(&io->list));
69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
70 136
71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 137 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
72 wake_up_all(ext4_ioend_wq(io->inode)); 138 /* Wake up anyone waiting on unwritten extent conversion */
73 kmem_cache_free(io_end_cachep, io); 139 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
140 wake_up_all(ext4_ioend_wq(inode));
74} 141}
75 142
76/* check a range of space and convert unwritten extents to written. */ 143/*
144 * Check a range of space and convert unwritten extents to written. Note that
145 * we are protected from truncate touching the same part of the extent tree by the
146 * fact that truncate code waits for all DIO to finish (thus exclusion from
147 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
148 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
149 * completed (happens from ext4_free_ioend()).
150 */
77static int ext4_end_io(ext4_io_end_t *io) 151static int ext4_end_io(ext4_io_end_t *io)
78{ 152{
79 struct inode *inode = io->inode; 153 struct inode *inode = io->inode;
80 loff_t offset = io->offset; 154 loff_t offset = io->offset;
81 ssize_t size = io->size; 155 ssize_t size = io->size;
156 handle_t *handle = io->handle;
82 int ret = 0; 157 int ret = 0;
83 158
84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 159 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
85 "list->prev 0x%p\n", 160 "list->prev 0x%p\n",
86 io, inode->i_ino, io->list.next, io->list.prev); 161 io, inode->i_ino, io->list.next, io->list.prev);
87 162
88 ret = ext4_convert_unwritten_extents(inode, offset, size); 163 io->handle = NULL; /* Following call will use up the handle */
164 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
89 if (ret < 0) { 165 if (ret < 0) {
90 ext4_msg(inode->i_sb, KERN_EMERG, 166 ext4_msg(inode->i_sb, KERN_EMERG,
91 "failed to convert unwritten extents to written " 167 "failed to convert unwritten extents to written "
@@ -93,30 +169,22 @@ static int ext4_end_io(ext4_io_end_t *io)
93 "(inode %lu, offset %llu, size %zd, error %d)", 169 "(inode %lu, offset %llu, size %zd, error %d)",
94 inode->i_ino, offset, size, ret); 170 inode->i_ino, offset, size, ret);
95 } 171 }
96 /* Wake up anyone waiting on unwritten extent conversion */ 172 ext4_clear_io_unwritten_flag(io);
97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 173 ext4_release_io_end(io);
98 wake_up_all(ext4_ioend_wq(inode));
99 if (io->flag & EXT4_IO_END_DIRECT)
100 inode_dio_done(inode);
101 if (io->iocb)
102 aio_complete(io->iocb, io->result, 0);
103 return ret; 174 return ret;
104} 175}
105 176
106static void dump_completed_IO(struct inode *inode) 177static void dump_completed_IO(struct inode *inode, struct list_head *head)
107{ 178{
108#ifdef EXT4FS_DEBUG 179#ifdef EXT4FS_DEBUG
109 struct list_head *cur, *before, *after; 180 struct list_head *cur, *before, *after;
110 ext4_io_end_t *io, *io0, *io1; 181 ext4_io_end_t *io, *io0, *io1;
111 182
112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 183 if (list_empty(head))
113 ext4_debug("inode %lu completed_io list is empty\n",
114 inode->i_ino);
115 return; 184 return;
116 }
117 185
118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 186 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 187 list_for_each_entry(io, head, list) {
120 cur = &io->list; 188 cur = &io->list;
121 before = cur->prev; 189 before = cur->prev;
122 io0 = container_of(before, ext4_io_end_t, list); 190 io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +198,30 @@ static void dump_completed_IO(struct inode *inode)
130} 198}
131 199
132/* Add the io_end to per-inode completed end_io list. */ 200/* Add the io_end to per-inode completed end_io list. */
133void ext4_add_complete_io(ext4_io_end_t *io_end) 201static void ext4_add_complete_io(ext4_io_end_t *io_end)
134{ 202{
135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 203 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
136 struct workqueue_struct *wq; 204 struct workqueue_struct *wq;
137 unsigned long flags; 205 unsigned long flags;
138 206
139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 207 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
141
142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 208 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
143 if (list_empty(&ei->i_completed_io_list)) 209 if (io_end->handle) {
144 queue_work(wq, &ei->i_unwritten_work); 210 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 211 if (list_empty(&ei->i_rsv_conversion_list))
212 queue_work(wq, &ei->i_rsv_conversion_work);
213 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
214 } else {
215 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
216 if (list_empty(&ei->i_unrsv_conversion_list))
217 queue_work(wq, &ei->i_unrsv_conversion_work);
218 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
219 }
146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 220 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
147} 221}
148 222
149static int ext4_do_flush_completed_IO(struct inode *inode) 223static int ext4_do_flush_completed_IO(struct inode *inode,
224 struct list_head *head)
150{ 225{
151 ext4_io_end_t *io; 226 ext4_io_end_t *io;
152 struct list_head unwritten; 227 struct list_head unwritten;
@@ -155,8 +230,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
155 int err, ret = 0; 230 int err, ret = 0;
156 231
157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 232 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
158 dump_completed_IO(inode); 233 dump_completed_IO(inode, head);
159 list_replace_init(&ei->i_completed_io_list, &unwritten); 234 list_replace_init(head, &unwritten);
160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 235 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
161 236
162 while (!list_empty(&unwritten)) { 237 while (!list_empty(&unwritten)) {
@@ -167,30 +242,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
167 err = ext4_end_io(io); 242 err = ext4_end_io(io);
168 if (unlikely(!ret && err)) 243 if (unlikely(!ret && err))
169 ret = err; 244 ret = err;
170 io->flag &= ~EXT4_IO_END_UNWRITTEN;
171 ext4_free_io_end(io);
172 } 245 }
173 return ret; 246 return ret;
174} 247}
175 248
176/* 249/*
177 * work on completed aio dio IO, to convert unwritten extents to extents 250 * work on completed IO, to convert unwritten extents to written extents
178 */ 251 */
179void ext4_end_io_work(struct work_struct *work) 252void ext4_end_io_rsv_work(struct work_struct *work)
180{ 253{
181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 254 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
182 i_unwritten_work); 255 i_rsv_conversion_work);
183 ext4_do_flush_completed_IO(&ei->vfs_inode); 256 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
184} 257}
185 258
186int ext4_flush_unwritten_io(struct inode *inode) 259void ext4_end_io_unrsv_work(struct work_struct *work)
187{ 260{
188 int ret; 261 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 262 i_unrsv_conversion_work);
190 !(inode->i_state & I_FREEING)); 263 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
191 ret = ext4_do_flush_completed_IO(inode);
192 ext4_unwritten_wait(inode);
193 return ret;
194} 264}
195 265
196ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 266ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +270,59 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 270 atomic_inc(&EXT4_I(inode)->i_ioend_count);
201 io->inode = inode; 271 io->inode = inode;
202 INIT_LIST_HEAD(&io->list); 272 INIT_LIST_HEAD(&io->list);
273 atomic_set(&io->count, 1);
203 } 274 }
204 return io; 275 return io;
205} 276}
206 277
207/* 278void ext4_put_io_end_defer(ext4_io_end_t *io_end)
208 * Print a buffer I/O error compatible with fs/buffer.c. This
209 * provides compatibility with dmesg scrapers that look for a specific
210 * buffer I/O error message. We really need a unified error reporting
211 * structure to userspace ala Digital Unix's uerf system, but it's
212 * probably not going to happen in my lifetime, due to LKML politics...
213 */
214static void buffer_io_error(struct buffer_head *bh)
215{ 279{
216 char b[BDEVNAME_SIZE]; 280 if (atomic_dec_and_test(&io_end->count)) {
217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 281 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
218 bdevname(bh->b_bdev, b), 282 ext4_release_io_end(io_end);
219 (unsigned long long)bh->b_blocknr); 283 return;
284 }
285 ext4_add_complete_io(io_end);
286 }
220} 287}
221 288
289int ext4_put_io_end(ext4_io_end_t *io_end)
290{
291 int err = 0;
292
293 if (atomic_dec_and_test(&io_end->count)) {
294 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
295 err = ext4_convert_unwritten_extents(io_end->handle,
296 io_end->inode, io_end->offset,
297 io_end->size);
298 io_end->handle = NULL;
299 ext4_clear_io_unwritten_flag(io_end);
300 }
301 ext4_release_io_end(io_end);
302 }
303 return err;
304}
305
306ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
307{
308 atomic_inc(&io_end->count);
309 return io_end;
310}
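
Taken together, these helpers replace the old single-owner io_end with reference counting. The lifetime, sketched (counts refer to io_end->count initialized in ext4_init_io_end() above):

/*
 * ext4_init_io_end()           count = 1    submitter's reference
 * ext4_get_io_end()            count++      one per bio pointing at the io_end
 * ext4_end_bio()
 *   -> ext4_put_io_end_defer() count--      irq context: any needed extent
 *                                           conversion is queued to a workqueue
 * submitter finishes
 *   -> ext4_put_io_end()       count--      process context: may convert
 *                                           unwritten extents inline
 *
 * Whoever drops the last reference calls ext4_release_io_end(), so a
 * completing bio can no longer free an io_end that the submitter is
 * still attaching pages to.
 */
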
311
312/* BIO completion function for page writeback */
222static void ext4_end_bio(struct bio *bio, int error) 313static void ext4_end_bio(struct bio *bio, int error)
223{ 314{
224 ext4_io_end_t *io_end = bio->bi_private; 315 ext4_io_end_t *io_end = bio->bi_private;
225 struct inode *inode;
226 int i;
227 int blocksize;
228 sector_t bi_sector = bio->bi_sector; 316 sector_t bi_sector = bio->bi_sector;
229 317
230 BUG_ON(!io_end); 318 BUG_ON(!io_end);
231 inode = io_end->inode;
232 blocksize = 1 << inode->i_blkbits;
233 bio->bi_private = NULL;
234 bio->bi_end_io = NULL; 319 bio->bi_end_io = NULL;
235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 320 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
236 error = 0; 321 error = 0;
237 for (i = 0; i < bio->bi_vcnt; i++) {
238 struct bio_vec *bvec = &bio->bi_io_vec[i];
239 struct page *page = bvec->bv_page;
240 struct buffer_head *bh, *head;
241 unsigned bio_start = bvec->bv_offset;
242 unsigned bio_end = bio_start + bvec->bv_len;
243 unsigned under_io = 0;
244 unsigned long flags;
245
246 if (!page)
247 continue;
248
249 if (error) {
250 SetPageError(page);
251 set_bit(AS_EIO, &page->mapping->flags);
252 }
253 bh = head = page_buffers(page);
254 /*
255 * We check all buffers in the page under BH_Uptodate_Lock
256 * to avoid races with other end io clearing async_write flags
257 */
258 local_irq_save(flags);
259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
260 do {
261 if (bh_offset(bh) < bio_start ||
262 bh_offset(bh) + blocksize > bio_end) {
263 if (buffer_async_write(bh))
264 under_io++;
265 continue;
266 }
267 clear_buffer_async_write(bh);
268 if (error)
269 buffer_io_error(bh);
270 } while ((bh = bh->b_this_page) != head);
271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
272 local_irq_restore(flags);
273 if (!under_io)
274 end_page_writeback(page);
275 }
276 bio_put(bio);
277 322
278 if (error) { 323 if (error) {
279 io_end->flag |= EXT4_IO_END_ERROR; 324 struct inode *inode = io_end->inode;
325
280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 326 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
281 "(offset %llu size %ld starting block %llu)", 327 "(offset %llu size %ld starting block %llu)",
282 inode->i_ino, 328 inode->i_ino,
@@ -286,12 +332,23 @@ static void ext4_end_bio(struct bio *bio, int error)
286 bi_sector >> (inode->i_blkbits - 9)); 332 bi_sector >> (inode->i_blkbits - 9));
287 } 333 }
288 334
289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 335 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
290 ext4_free_io_end(io_end); 336 /*
291 return; 337 * Link bio into list hanging from io_end. We have to do it
338 * atomically as bio completions can be racing against each
339 * other.
340 */
341 bio->bi_private = xchg(&io_end->bio, bio);
342 ext4_put_io_end_defer(io_end);
343 } else {
344 /*
345 * Drop io_end reference early. Inode can get freed once
346 * we finish the bio.
347 */
348 ext4_put_io_end_defer(io_end);
349 ext4_finish_bio(bio);
350 bio_put(bio);
292 } 351 }
293
294 ext4_add_complete_io(io_end);
295} 352}
296 353
297void ext4_io_submit(struct ext4_io_submit *io) 354void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +362,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
305 bio_put(io->io_bio); 362 bio_put(io->io_bio);
306 } 363 }
307 io->io_bio = NULL; 364 io->io_bio = NULL;
308 io->io_op = 0; 365}
366
367void ext4_io_submit_init(struct ext4_io_submit *io,
368 struct writeback_control *wbc)
369{
370 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
371 io->io_bio = NULL;
309 io->io_end = NULL; 372 io->io_end = NULL;
310} 373}
311 374
312static int io_submit_init(struct ext4_io_submit *io, 375static int io_submit_init_bio(struct ext4_io_submit *io,
313 struct inode *inode, 376 struct buffer_head *bh)
314 struct writeback_control *wbc,
315 struct buffer_head *bh)
316{ 377{
317 ext4_io_end_t *io_end;
318 struct page *page = bh->b_page;
319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 378 int nvecs = bio_get_nr_vecs(bh->b_bdev);
320 struct bio *bio; 379 struct bio *bio;
321 380
322 io_end = ext4_init_io_end(inode, GFP_NOFS);
323 if (!io_end)
324 return -ENOMEM;
325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 381 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
382 if (!bio)
383 return -ENOMEM;
326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 384 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
327 bio->bi_bdev = bh->b_bdev; 385 bio->bi_bdev = bh->b_bdev;
328 bio->bi_private = io->io_end = io_end;
329 bio->bi_end_io = ext4_end_bio; 386 bio->bi_end_io = ext4_end_bio;
330 387 bio->bi_private = ext4_get_io_end(io->io_end);
331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
332
333 io->io_bio = bio; 388 io->io_bio = bio;
334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
335 io->io_next_block = bh->b_blocknr; 389 io->io_next_block = bh->b_blocknr;
336 return 0; 390 return 0;
337} 391}
338 392
339static int io_submit_add_bh(struct ext4_io_submit *io, 393static int io_submit_add_bh(struct ext4_io_submit *io,
340 struct inode *inode, 394 struct inode *inode,
341 struct writeback_control *wbc,
342 struct buffer_head *bh) 395 struct buffer_head *bh)
343{ 396{
344 ext4_io_end_t *io_end;
345 int ret; 397 int ret;
346 398
347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 399 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +401,14 @@ submit_and_retry:
349 ext4_io_submit(io); 401 ext4_io_submit(io);
350 } 402 }
351 if (io->io_bio == NULL) { 403 if (io->io_bio == NULL) {
352 ret = io_submit_init(io, inode, wbc, bh); 404 ret = io_submit_init_bio(io, bh);
353 if (ret) 405 if (ret)
354 return ret; 406 return ret;
355 } 407 }
356 io_end = io->io_end;
357 if (test_clear_buffer_uninit(bh))
358 ext4_set_io_unwritten_flag(inode, io_end);
359 io->io_end->size += bh->b_size;
360 io->io_next_block++;
361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 408 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (ret != bh->b_size) 409 if (ret != bh->b_size)
363 goto submit_and_retry; 410 goto submit_and_retry;
411 io->io_next_block++;
364 return 0; 412 return 0;
365} 413}
366 414
@@ -432,7 +480,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
432 do { 480 do {
433 if (!buffer_async_write(bh)) 481 if (!buffer_async_write(bh))
434 continue; 482 continue;
435 ret = io_submit_add_bh(io, inode, wbc, bh); 483 ret = io_submit_add_bh(io, inode, bh);
436 if (ret) { 484 if (ret) {
437 /* 485 /*
438 * We only get here on ENOMEM. Not much else 486 * We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..b59373b625e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
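
The same barrier is added in __ext4_abort() below. The contract: any earlier ->s_mount_flags update (EXT4_MF_FS_ABORTED in the abort path) must be visible before MS_RDONLY is. A reader relying on this ordering would pair it with a read barrier, roughly (illustrative pairing; the real checks are scattered through ext4):

/*
 * writer (this patch):                reader (illustrative pairing):
 *
 *   s_mount_flags |= ABORTED;           if (sb->s_flags & MS_RDONLY) {
 *   smp_wmb();                                  smp_rmb();
 *   sb->s_flags |= MS_RDONLY;                   if (s_mount_flags & ABORTED)
 *                                                       ... see the abort ...
 *                                       }
 */
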
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1341,7 +1359,7 @@ static const struct mount_opts {
1341 {Opt_delalloc, EXT4_MOUNT_DELALLOC, 1359 {Opt_delalloc, EXT4_MOUNT_DELALLOC,
1342 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1360 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1343 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1361 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1344 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, 1362 MOPT_EXT4_ONLY | MOPT_CLEAR},
1345 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1363 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1346 MOPT_EXT4_ONLY | MOPT_SET}, 1364 MOPT_EXT4_ONLY | MOPT_SET},
1347 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1365 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1684,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1684 1702
1685 if (sbi->s_qf_names[GRPQUOTA]) 1703 if (sbi->s_qf_names[GRPQUOTA])
1686 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1704 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1687
1688 if (test_opt(sb, USRQUOTA))
1689 seq_puts(seq, ",usrquota");
1690
1691 if (test_opt(sb, GRPQUOTA))
1692 seq_puts(seq, ",grpquota");
1693#endif 1705#endif
1694} 1706}
1695 1707
@@ -1908,7 +1920,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1908 struct ext4_sb_info *sbi = EXT4_SB(sb); 1920 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_group_desc *gdp = NULL; 1921 struct ext4_group_desc *gdp = NULL;
1910 ext4_group_t flex_group; 1922 ext4_group_t flex_group;
1911 unsigned int groups_per_flex = 0;
1912 int i, err; 1923 int i, err;
1913 1924
1914 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1925 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1927,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1916 sbi->s_log_groups_per_flex = 0; 1927 sbi->s_log_groups_per_flex = 0;
1917 return 1; 1928 return 1;
1918 } 1929 }
1919 groups_per_flex = 1U << sbi->s_log_groups_per_flex;
1920 1930
1921 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1931 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1922 if (err) 1932 if (err)
@@ -2164,19 +2174,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2164 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2174 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2165 dquot_initialize(inode); 2175 dquot_initialize(inode);
2166 if (inode->i_nlink) { 2176 if (inode->i_nlink) {
2167 ext4_msg(sb, KERN_DEBUG, 2177 if (test_opt(sb, DEBUG))
2168 "%s: truncating inode %lu to %lld bytes", 2178 ext4_msg(sb, KERN_DEBUG,
2169 __func__, inode->i_ino, inode->i_size); 2179 "%s: truncating inode %lu to %lld bytes",
2180 __func__, inode->i_ino, inode->i_size);
2170 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2181 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2171 inode->i_ino, inode->i_size); 2182 inode->i_ino, inode->i_size);
2172 mutex_lock(&inode->i_mutex); 2183 mutex_lock(&inode->i_mutex);
2184 truncate_inode_pages(inode->i_mapping, inode->i_size);
2173 ext4_truncate(inode); 2185 ext4_truncate(inode);
2174 mutex_unlock(&inode->i_mutex); 2186 mutex_unlock(&inode->i_mutex);
2175 nr_truncates++; 2187 nr_truncates++;
2176 } else { 2188 } else {
2177 ext4_msg(sb, KERN_DEBUG, 2189 if (test_opt(sb, DEBUG))
2178 "%s: deleting unreferenced inode %lu", 2190 ext4_msg(sb, KERN_DEBUG,
2179 __func__, inode->i_ino); 2191 "%s: deleting unreferenced inode %lu",
2192 __func__, inode->i_ino);
2180 jbd_debug(2, "deleting unreferenced inode %lu\n", 2193 jbd_debug(2, "deleting unreferenced inode %lu\n",
2181 inode->i_ino); 2194 inode->i_ino);
2182 nr_orphans++; 2195 nr_orphans++;
@@ -2377,7 +2390,10 @@ struct ext4_attr {
2377 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2390 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2378 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2391 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2379 const char *, size_t); 2392 const char *, size_t);
2380 int offset; 2393 union {
2394 int offset;
2395 int deprecated_val;
2396 } u;
2381}; 2397};
2382 2398
2383static int parse_strtoull(const char *buf, 2399static int parse_strtoull(const char *buf,
@@ -2446,7 +2462,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2446static ssize_t sbi_ui_show(struct ext4_attr *a, 2462static ssize_t sbi_ui_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf) 2463 struct ext4_sb_info *sbi, char *buf)
2448{ 2464{
2449 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2465 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2450 2466
2451 return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2467 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2452} 2468}
@@ -2455,7 +2471,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2455 struct ext4_sb_info *sbi, 2471 struct ext4_sb_info *sbi,
2456 const char *buf, size_t count) 2472 const char *buf, size_t count)
2457{ 2473{
2458 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2474 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2459 unsigned long t; 2475 unsigned long t;
2460 int ret; 2476 int ret;
2461 2477
@@ -2504,12 +2520,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
2504 return count; 2520 return count;
2505} 2521}
2506 2522
2523static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2524 struct ext4_sb_info *sbi, char *buf)
2525{
2526 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2527}
2528
2507#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2529#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2508static struct ext4_attr ext4_attr_##_name = { \ 2530static struct ext4_attr ext4_attr_##_name = { \
2509 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2531 .attr = {.name = __stringify(_name), .mode = _mode }, \
2510 .show = _show, \ 2532 .show = _show, \
2511 .store = _store, \ 2533 .store = _store, \
2512 .offset = offsetof(struct ext4_sb_info, _elname), \ 2534 .u = { \
2535 .offset = offsetof(struct ext4_sb_info, _elname),\
2536 }, \
2513} 2537}
2514#define EXT4_ATTR(name, mode, show, store) \ 2538#define EXT4_ATTR(name, mode, show, store) \
2515static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2539static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2544,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2520#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2544#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2521 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2545 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2522#define ATTR_LIST(name) &ext4_attr_##name.attr 2546#define ATTR_LIST(name) &ext4_attr_##name.attr
2547#define EXT4_DEPRECATED_ATTR(_name, _val) \
2548static struct ext4_attr ext4_attr_##_name = { \
2549 .attr = {.name = __stringify(_name), .mode = 0444 }, \
2550 .show = sbi_deprecated_show, \
2551 .u = { \
2552 .deprecated_val = _val, \
2553 }, \
2554}
2523 2555
2524EXT4_RO_ATTR(delayed_allocation_blocks); 2556EXT4_RO_ATTR(delayed_allocation_blocks);
2525EXT4_RO_ATTR(session_write_kbytes); 2557EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2566,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2534EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2566EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2567EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2568EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2569EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2538EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2570EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2539EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2571EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2540 2572
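The union lets a single ext4_attr type serve two kinds of sysfs attribute: live ones that read a field at a byte offset inside ext4_sb_info, and deprecated ones that merely echo a fixed value (max_writeback_mb_bump above). A userspace sketch of the same dispatch, with made-up field names:

#include <stdio.h>
#include <stddef.h>

struct sb_info {                       /* stand-in for ext4_sb_info */
    unsigned int extent_max_zeroout_kb;
};

struct attr {
    const char *name;
    void (*show)(const struct attr *, const struct sb_info *);
    union {
        size_t offset;                 /* byte offset into sb_info */
        int deprecated_val;            /* fixed value for dead knobs */
    } u;
};

static void ui_show(const struct attr *a, const struct sb_info *sbi)
{
    const unsigned int *ui =
        (const unsigned int *)((const char *)sbi + a->u.offset);
    printf("%s: %u\n", a->name, *ui);
}

static void deprecated_show(const struct attr *a, const struct sb_info *sbi)
{
    (void)sbi;                         /* value no longer lives in sb_info */
    printf("%s: %d\n", a->name, a->u.deprecated_val);
}

static const struct attr attrs[] = {
    { "extent_max_zeroout_kb", ui_show,
      { .offset = offsetof(struct sb_info, extent_max_zeroout_kb) } },
    { "max_writeback_mb_bump", deprecated_show, { .deprecated_val = 128 } },
};

int main(void)
{
    struct sb_info sbi = { .extent_max_zeroout_kb = 32 };
    size_t i;

    for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
        attrs[i].show(&attrs[i], &sbi);
    return 0;
}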
@@ -3451,7 +3483,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3451 } 3483 }
3452 if (test_opt(sb, DIOREAD_NOLOCK)) { 3484 if (test_opt(sb, DIOREAD_NOLOCK)) {
3453 ext4_msg(sb, KERN_ERR, "can't mount with " 3485 ext4_msg(sb, KERN_ERR, "can't mount with "
3454 "both data=journal and delalloc"); 3486 "both data=journal and dioread_nolock");
3455 goto failed_mount; 3487 goto failed_mount;
3456 } 3488 }
3457 if (test_opt(sb, DELALLOC)) 3489 if (test_opt(sb, DELALLOC))
@@ -3586,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3586 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3618 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3587 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3619 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3588 3620
3589 /* Do we have standard group size of blocksize * 8 blocks ? */
3590 if (sbi->s_blocks_per_group == blocksize << 3)
3591 set_opt2(sb, STD_GROUP_SIZE);
3592
3593 for (i = 0; i < 4; i++) 3621 for (i = 0; i < 4; i++)
3594 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3622 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3595 sbi->s_def_hash_version = es->s_def_hash_version; 3623 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3659,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3659 goto failed_mount; 3687 goto failed_mount;
3660 } 3688 }
3661 3689
 3690 /* Do we have a standard group size of clustersize * 8 blocks? */
3691 if (sbi->s_blocks_per_group == clustersize << 3)
3692 set_opt2(sb, STD_GROUP_SIZE);
3693
3662 /* 3694 /*
3663 * Test whether we have more sectors than will fit in sector_t, 3695 * Test whether we have more sectors than will fit in sector_t,
3664 * and whether the max offset is addressable by the page cache. 3696 * and whether the max offset is addressable by the page cache.
@@ -3763,7 +3795,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3763 sbi->s_err_report.data = (unsigned long) sb; 3795 sbi->s_err_report.data = (unsigned long) sb;
3764 3796
3765 /* Register extent status tree shrinker */ 3797 /* Register extent status tree shrinker */
3766 ext4_es_register_shrinker(sb); 3798 ext4_es_register_shrinker(sbi);
3767 3799
3768 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3800 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3769 ext4_count_free_clusters(sb)); 3801 ext4_count_free_clusters(sb));
@@ -3787,7 +3819,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3787 } 3819 }
3788 3820
3789 sbi->s_stripe = ext4_get_stripe_size(sbi); 3821 sbi->s_stripe = ext4_get_stripe_size(sbi);
3790 sbi->s_max_writeback_mb_bump = 128;
3791 sbi->s_extent_max_zeroout_kb = 32; 3822 sbi->s_extent_max_zeroout_kb = 32;
3792 3823
3793 /* 3824 /*
@@ -3915,12 +3946,20 @@ no_journal:
3915 * The maximum number of concurrent works can be high and 3946 * The maximum number of concurrent works can be high and
3916 * concurrency isn't really necessary. Limit it to 1. 3947 * concurrency isn't really necessary. Limit it to 1.
3917 */ 3948 */
3918 EXT4_SB(sb)->dio_unwritten_wq = 3949 EXT4_SB(sb)->rsv_conversion_wq =
3919 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3950 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3920 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3951 if (!EXT4_SB(sb)->rsv_conversion_wq) {
3921 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3952 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3922 ret = -ENOMEM; 3953 ret = -ENOMEM;
3923 goto failed_mount_wq; 3954 goto failed_mount4;
3955 }
3956
3957 EXT4_SB(sb)->unrsv_conversion_wq =
3958 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3959 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3960 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3961 ret = -ENOMEM;
3962 goto failed_mount4;
3924 } 3963 }
3925 3964
3926 /* 3965 /*
@@ -4074,14 +4113,17 @@ failed_mount4a:
4074 sb->s_root = NULL; 4113 sb->s_root = NULL;
4075failed_mount4: 4114failed_mount4:
4076 ext4_msg(sb, KERN_ERR, "mount failed"); 4115 ext4_msg(sb, KERN_ERR, "mount failed");
4077 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 4116 if (EXT4_SB(sb)->rsv_conversion_wq)
4117 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4118 if (EXT4_SB(sb)->unrsv_conversion_wq)
4119 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4078failed_mount_wq: 4120failed_mount_wq:
4079 if (sbi->s_journal) { 4121 if (sbi->s_journal) {
4080 jbd2_journal_destroy(sbi->s_journal); 4122 jbd2_journal_destroy(sbi->s_journal);
4081 sbi->s_journal = NULL; 4123 sbi->s_journal = NULL;
4082 } 4124 }
4083failed_mount3: 4125failed_mount3:
4084 ext4_es_unregister_shrinker(sb); 4126 ext4_es_unregister_shrinker(sbi);
4085 del_timer(&sbi->s_err_report); 4127 del_timer(&sbi->s_err_report);
4086 if (sbi->s_flex_groups) 4128 if (sbi->s_flex_groups)
4087 ext4_kvfree(sbi->s_flex_groups); 4129 ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4559,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4517{ 4559{
4518 int ret = 0; 4560 int ret = 0;
4519 tid_t target; 4561 tid_t target;
4562 bool needs_barrier = false;
4520 struct ext4_sb_info *sbi = EXT4_SB(sb); 4563 struct ext4_sb_info *sbi = EXT4_SB(sb);
4521 4564
4522 trace_ext4_sync_fs(sb, wait); 4565 trace_ext4_sync_fs(sb, wait);
4523 flush_workqueue(sbi->dio_unwritten_wq); 4566 flush_workqueue(sbi->rsv_conversion_wq);
4567 flush_workqueue(sbi->unrsv_conversion_wq);
4524 /* 4568 /*
4525 * Writeback quota in non-journalled quota case - journalled quota has 4569 * Writeback quota in non-journalled quota case - journalled quota has
4526 * no dirty dquots 4570 * no dirty dquots
4527 */ 4571 */
4528 dquot_writeback_dquots(sb, -1); 4572 dquot_writeback_dquots(sb, -1);
4573 /*
 4574 * Data writeback is possible w/o a journal transaction, so the barrier
 4575 * must be sent at the end of the function. But we can skip it if
4576 * transaction_commit will do it for us.
4577 */
4578 target = jbd2_get_latest_transaction(sbi->s_journal);
4579 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4580 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4581 needs_barrier = true;
4582
4529 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4583 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4530 if (wait) 4584 if (wait)
4531 jbd2_log_wait_commit(sbi->s_journal, target); 4585 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4532 } 4586 }
4587 if (needs_barrier) {
4588 int err;
4589 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4590 if (!ret)
4591 ret = err;
4592 }
4593
4594 return ret;
4595}
4596
4597static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4598{
4599 int ret = 0;
4600
4601 trace_ext4_sync_fs(sb, wait);
4602 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4603 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4604 dquot_writeback_dquots(sb, -1);
4605 if (wait && test_opt(sb, BARRIER))
4606 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4607
4533 return ret; 4608 return ret;
4534} 4609}
4535 4610
@@ -4652,6 +4727,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4652 goto restore_opts; 4727 goto restore_opts;
4653 } 4728 }
4654 4729
4730 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4731 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4732 ext4_msg(sb, KERN_ERR, "can't mount with "
4733 "both data=journal and delalloc");
4734 err = -EINVAL;
4735 goto restore_opts;
4736 }
4737 if (test_opt(sb, DIOREAD_NOLOCK)) {
4738 ext4_msg(sb, KERN_ERR, "can't mount with "
4739 "both data=journal and dioread_nolock");
4740 err = -EINVAL;
4741 goto restore_opts;
4742 }
4743 }
4744
4655 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4745 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4656 ext4_abort(sb, "Abort forced by user"); 4746 ext4_abort(sb, "Abort forced by user");
4657 4747
@@ -5406,6 +5496,7 @@ static void __exit ext4_exit_fs(void)
5406 kset_unregister(ext4_kset); 5496 kset_unregister(ext4_kset);
5407 ext4_exit_system_zone(); 5497 ext4_exit_system_zone();
5408 ext4_exit_pageio(); 5498 ext4_exit_pageio();
5499 ext4_exit_es();
5409} 5500}
5410 5501
5411MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 5502MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
51 Linux website <http://acl.bestbits.at/>. 51 Linux website <http://acl.bestbits.at/>.
52 52
53 If you don't know what Access Control Lists are, say N 53 If you don't know what Access Control Lists are, say N
54
55config F2FS_FS_SECURITY
56 bool "F2FS Security Labels"
57 depends on F2FS_FS_XATTR
58 help
59 Security labels provide an access control facility to support Linux
60 Security Models (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
61 Linux. This option enables an extended attribute handler for file
62 security labels in the f2fs filesystem, and therefore requires that
63 extended attribute support be enabled first.
64
65 If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
250 } 250 }
251 } 251 }
252 252
253 error = f2fs_setxattr(inode, name_index, "", value, size); 253 error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
254 254
255 kfree(value); 255 kfree(value);
256 if (!error) 256 if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
357 unsigned long blk_size = sbi->blocksize; 357 unsigned long blk_size = sbi->blocksize;
358 struct f2fs_checkpoint *cp_block; 358 struct f2fs_checkpoint *cp_block;
359 unsigned long long cur_version = 0, pre_version = 0; 359 unsigned long long cur_version = 0, pre_version = 0;
360 unsigned int crc = 0;
361 size_t crc_offset; 360 size_t crc_offset;
361 __u32 crc = 0;
362 362
363 /* Read the 1st cp block in this CP pack */ 363 /* Read the 1st cp block in this CP pack */
364 cp_page_1 = get_meta_page(sbi, cp_addr); 364 cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
369 if (crc_offset >= blk_size) 369 if (crc_offset >= blk_size)
370 goto invalid_cp1; 370 goto invalid_cp1;
371 371
372 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); 372 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
373 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 373 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
374 goto invalid_cp1; 374 goto invalid_cp1;
375 375
@@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
384 if (crc_offset >= blk_size) 384 if (crc_offset >= blk_size)
385 goto invalid_cp2; 385 goto invalid_cp2;
386 386
387 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); 387 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
388 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 388 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
389 goto invalid_cp2; 389 goto invalid_cp2;
390 390
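The fix above loads the on-disk CRC through le32_to_cpu instead of a raw host-endian dereference, so the check also passes on big-endian machines. A portable userspace equivalent assembles the little-endian field byte by byte:

#include <stdint.h>
#include <stdio.h>

/* Read a little-endian u32 from a byte buffer, on any host endianness. */
static uint32_t get_le32(const unsigned char *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
    /* 0x20f5f210 stored on disk in little-endian byte order */
    unsigned char blk[] = { 0x10, 0xf2, 0xf5, 0x20 };

    printf("crc=0x%08x\n", get_le32(blk));  /* 0x20f5f210 everywhere */
    return 0;
}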
@@ -450,13 +450,30 @@ fail_no_cp:
450 return -EINVAL; 450 return -EINVAL;
451} 451}
452 452
453void set_dirty_dir_page(struct inode *inode, struct page *page) 453static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
454{ 454{
455 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 455 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
456 struct list_head *head = &sbi->dir_inode_list; 456 struct list_head *head = &sbi->dir_inode_list;
457 struct dir_inode_entry *new;
458 struct list_head *this; 457 struct list_head *this;
459 458
459 list_for_each(this, head) {
460 struct dir_inode_entry *entry;
461 entry = list_entry(this, struct dir_inode_entry, list);
462 if (entry->inode == inode)
463 return -EEXIST;
464 }
465 list_add_tail(&new->list, head);
466#ifdef CONFIG_F2FS_STAT_FS
467 sbi->n_dirty_dirs++;
468#endif
469 return 0;
470}
471
472void set_dirty_dir_page(struct inode *inode, struct page *page)
473{
474 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
475 struct dir_inode_entry *new;
476
460 if (!S_ISDIR(inode->i_mode)) 477 if (!S_ISDIR(inode->i_mode))
461 return; 478 return;
462retry: 479retry:
@@ -469,23 +486,31 @@ retry:
469 INIT_LIST_HEAD(&new->list); 486 INIT_LIST_HEAD(&new->list);
470 487
471 spin_lock(&sbi->dir_inode_lock); 488 spin_lock(&sbi->dir_inode_lock);
472 list_for_each(this, head) { 489 if (__add_dirty_inode(inode, new))
473 struct dir_inode_entry *entry; 490 kmem_cache_free(inode_entry_slab, new);
474 entry = list_entry(this, struct dir_inode_entry, list);
475 if (entry->inode == inode) {
476 kmem_cache_free(inode_entry_slab, new);
477 goto out;
478 }
479 }
480 list_add_tail(&new->list, head);
481 sbi->n_dirty_dirs++;
482 491
483 BUG_ON(!S_ISDIR(inode->i_mode));
484out:
485 inc_page_count(sbi, F2FS_DIRTY_DENTS); 492 inc_page_count(sbi, F2FS_DIRTY_DENTS);
486 inode_inc_dirty_dents(inode); 493 inode_inc_dirty_dents(inode);
487 SetPagePrivate(page); 494 SetPagePrivate(page);
495 spin_unlock(&sbi->dir_inode_lock);
496}
488 497
498void add_dirty_dir_inode(struct inode *inode)
499{
500 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
501 struct dir_inode_entry *new;
502retry:
503 new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
504 if (!new) {
505 cond_resched();
506 goto retry;
507 }
508 new->inode = inode;
509 INIT_LIST_HEAD(&new->list);
510
511 spin_lock(&sbi->dir_inode_lock);
512 if (__add_dirty_inode(inode, new))
513 kmem_cache_free(inode_entry_slab, new);
489 spin_unlock(&sbi->dir_inode_lock); 514 spin_unlock(&sbi->dir_inode_lock);
490} 515}
491 516
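Both callers follow the same discipline: allocate the entry with GFP_NOFS outside the spinlock (retrying on failure), insert under the lock, and free the allocation when __add_dirty_inode reports a duplicate. A pthread analogue of that pattern, with a plain linked list standing in for the dirty-dir list:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    unsigned long ino;
    struct entry *next;
};

static struct entry *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Insert 'new' unless ino is already present; caller holds list_lock. */
static int __add_entry(unsigned long ino, struct entry *new)
{
    struct entry *e;

    for (e = head; e; e = e->next)
        if (e->ino == ino)
            return -1;                 /* -EEXIST analogue */
    new->next = head;
    head = new;
    return 0;
}

static void add_dirty(unsigned long ino)
{
    /* allocate outside the lock, as the kernel code does with GFP_NOFS */
    struct entry *new = malloc(sizeof(*new));

    if (!new)
        return;
    new->ino = ino;

    pthread_mutex_lock(&list_lock);
    if (__add_entry(ino, new))
        free(new);                     /* lost the race: duplicate */
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    add_dirty(7);
    add_dirty(7);                      /* second insert is dropped */
    printf("head ino=%lu, next=%p\n", head->ino, (void *)head->next);
    return 0;
}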
@@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
499 return; 524 return;
500 525
501 spin_lock(&sbi->dir_inode_lock); 526 spin_lock(&sbi->dir_inode_lock);
502 if (atomic_read(&F2FS_I(inode)->dirty_dents)) 527 if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
503 goto out; 528 spin_unlock(&sbi->dir_inode_lock);
529 return;
530 }
504 531
505 list_for_each(this, head) { 532 list_for_each(this, head) {
506 struct dir_inode_entry *entry; 533 struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
508 if (entry->inode == inode) { 535 if (entry->inode == inode) {
509 list_del(&entry->list); 536 list_del(&entry->list);
510 kmem_cache_free(inode_entry_slab, entry); 537 kmem_cache_free(inode_entry_slab, entry);
538#ifdef CONFIG_F2FS_STAT_FS
511 sbi->n_dirty_dirs--; 539 sbi->n_dirty_dirs--;
540#endif
541 break;
542 }
543 }
544 spin_unlock(&sbi->dir_inode_lock);
545
546 /* Only from the recovery routine */
547 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
548 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
549 iput(inode);
550 }
551}
552
553struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
554{
555 struct list_head *head = &sbi->dir_inode_list;
556 struct list_head *this;
557 struct inode *inode = NULL;
558
559 spin_lock(&sbi->dir_inode_lock);
560 list_for_each(this, head) {
561 struct dir_inode_entry *entry;
562 entry = list_entry(this, struct dir_inode_entry, list);
563 if (entry->inode->i_ino == ino) {
564 inode = entry->inode;
512 break; 565 break;
513 } 566 }
514 } 567 }
515out:
516 spin_unlock(&sbi->dir_inode_lock); 568 spin_unlock(&sbi->dir_inode_lock);
569 return inode;
517} 570}
518 571
519void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 572void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
595 block_t start_blk; 648 block_t start_blk;
596 struct page *cp_page; 649 struct page *cp_page;
597 unsigned int data_sum_blocks, orphan_blocks; 650 unsigned int data_sum_blocks, orphan_blocks;
598 unsigned int crc32 = 0; 651 __u32 crc32 = 0;
599 void *kaddr; 652 void *kaddr;
600 int i; 653 int i;
601 654
@@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
664 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 717 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
665 718
666 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); 719 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
667 *(__le32 *)((unsigned char *)ckpt + 720 *((__le32 *)((unsigned char *)ckpt +
668 le32_to_cpu(ckpt->checksum_offset)) 721 le32_to_cpu(ckpt->checksum_offset)))
669 = cpu_to_le32(crc32); 722 = cpu_to_le32(crc32);
670 723
671 start_blk = __start_cp_addr(sbi); 724 start_blk = __start_cp_addr(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 68 struct buffer_head *bh_result)
69{ 69{
70 struct f2fs_inode_info *fi = F2FS_I(inode); 70 struct f2fs_inode_info *fi = F2FS_I(inode);
71#ifdef CONFIG_F2FS_STAT_FS
71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
73#endif
72 pgoff_t start_fofs, end_fofs; 74 pgoff_t start_fofs, end_fofs;
73 block_t start_blkaddr; 75 block_t start_blkaddr;
74 76
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
78 return 0; 80 return 0;
79 } 81 }
80 82
83#ifdef CONFIG_F2FS_STAT_FS
81 sbi->total_hit_ext++; 84 sbi->total_hit_ext++;
85#endif
82 start_fofs = fi->ext.fofs; 86 start_fofs = fi->ext.fofs;
83 end_fofs = fi->ext.fofs + fi->ext.len - 1; 87 end_fofs = fi->ext.fofs + fi->ext.len - 1;
84 start_blkaddr = fi->ext.blk_addr; 88 start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
96 else 100 else
97 bh_result->b_size = UINT_MAX; 101 bh_result->b_size = UINT_MAX;
98 102
103#ifdef CONFIG_F2FS_STAT_FS
99 sbi->read_hit_ext++; 104 sbi->read_hit_ext++;
105#endif
100 read_unlock(&fi->ext.ext_lock); 106 read_unlock(&fi->ext.ext_lock);
101 return 1; 107 return 1;
102 } 108 }
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
199 if (dn.data_blkaddr == NEW_ADDR) 205 if (dn.data_blkaddr == NEW_ADDR)
200 return ERR_PTR(-EINVAL); 206 return ERR_PTR(-EINVAL);
201 207
202 page = grab_cache_page(mapping, index); 208 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
203 if (!page) 209 if (!page)
204 return ERR_PTR(-ENOMEM); 210 return ERR_PTR(-ENOMEM);
205 211
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
233 struct page *page; 239 struct page *page;
234 int err; 240 int err;
235 241
242repeat:
243 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
244 if (!page)
245 return ERR_PTR(-ENOMEM);
246
236 set_new_dnode(&dn, inode, NULL, NULL, 0); 247 set_new_dnode(&dn, inode, NULL, NULL, 0);
237 err = get_dnode_of_data(&dn, index, LOOKUP_NODE); 248 err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
238 if (err) 249 if (err) {
250 f2fs_put_page(page, 1);
239 return ERR_PTR(err); 251 return ERR_PTR(err);
252 }
240 f2fs_put_dnode(&dn); 253 f2fs_put_dnode(&dn);
241 254
242 if (dn.data_blkaddr == NULL_ADDR) 255 if (dn.data_blkaddr == NULL_ADDR) {
256 f2fs_put_page(page, 1);
243 return ERR_PTR(-ENOENT); 257 return ERR_PTR(-ENOENT);
244repeat: 258 }
245 page = grab_cache_page(mapping, index);
246 if (!page)
247 return ERR_PTR(-ENOMEM);
248 259
249 if (PageUptodate(page)) 260 if (PageUptodate(page))
250 return page; 261 return page;
@@ -274,9 +285,10 @@ repeat:
274 * 285 *
275 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 286 * Also, caller should grab and release a mutex by calling mutex_lock_op() and
276 * mutex_unlock_op(). 287 * mutex_unlock_op().
 288 * Note that npage is set only by make_empty_dir.
277 */ 289 */
278struct page *get_new_data_page(struct inode *inode, pgoff_t index, 290struct page *get_new_data_page(struct inode *inode,
279 bool new_i_size) 291 struct page *npage, pgoff_t index, bool new_i_size)
280{ 292{
281 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 293 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
282 struct address_space *mapping = inode->i_mapping; 294 struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
284 struct dnode_of_data dn; 296 struct dnode_of_data dn;
285 int err; 297 int err;
286 298
287 set_new_dnode(&dn, inode, NULL, NULL, 0); 299 set_new_dnode(&dn, inode, npage, npage, 0);
288 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 300 err = get_dnode_of_data(&dn, index, ALLOC_NODE);
289 if (err) 301 if (err)
290 return ERR_PTR(err); 302 return ERR_PTR(err);
291 303
292 if (dn.data_blkaddr == NULL_ADDR) { 304 if (dn.data_blkaddr == NULL_ADDR) {
293 if (reserve_new_block(&dn)) { 305 if (reserve_new_block(&dn)) {
294 f2fs_put_dnode(&dn); 306 if (!npage)
307 f2fs_put_dnode(&dn);
295 return ERR_PTR(-ENOSPC); 308 return ERR_PTR(-ENOSPC);
296 } 309 }
297 } 310 }
298 f2fs_put_dnode(&dn); 311 if (!npage)
312 f2fs_put_dnode(&dn);
299repeat: 313repeat:
300 page = grab_cache_page(mapping, index); 314 page = grab_cache_page(mapping, index);
301 if (!page) 315 if (!page)
@@ -325,6 +339,8 @@ repeat:
325 if (new_i_size && 339 if (new_i_size &&
326 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { 340 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
327 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 341 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
342 /* Only the directory inode sets new_i_size */
343 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
328 mark_inode_dirty_sync(inode); 344 mark_inode_dirty_sync(inode);
329 } 345 }
330 return page; 346 return page;
@@ -481,8 +497,9 @@ int do_write_data_page(struct page *page)
481 * If current allocation needs SSR, 497 * If current allocation needs SSR,
482 * it had better in-place writes for updated data. 498 * it had better in-place writes for updated data.
483 */ 499 */
484 if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && 500 if (unlikely(old_blk_addr != NEW_ADDR &&
485 need_inplace_update(inode)) { 501 !is_cold_data(page) &&
502 need_inplace_update(inode))) {
486 rewrite_data_page(F2FS_SB(inode->i_sb), page, 503 rewrite_data_page(F2FS_SB(inode->i_sb), page,
487 old_blk_addr); 504 old_blk_addr);
488 } else { 505 } else {
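unlikely() is the kernel's wrapper around __builtin_expect: it tells the compiler that in-place rewrites are the cold path, so the common out-of-place branch is laid out fall-through. A minimal demo defining the macros the way the kernel does:

#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static const char *classify(int needs_inplace)
{
    if (unlikely(needs_inplace))
        return "in-place rewrite";     /* rare SSR path */
    return "out-of-place write";       /* common LFS path */
}

int main(void)
{
    printf("%s\n", classify(0));
    printf("%s\n", classify(1));
    return 0;
}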
@@ -684,6 +701,27 @@ err:
684 return err; 701 return err;
685} 702}
686 703
704static int f2fs_write_end(struct file *file,
705 struct address_space *mapping,
706 loff_t pos, unsigned len, unsigned copied,
707 struct page *page, void *fsdata)
708{
709 struct inode *inode = page->mapping->host;
710
711 SetPageUptodate(page);
712 set_page_dirty(page);
713
714 if (pos + copied > i_size_read(inode)) {
715 i_size_write(inode, pos + copied);
716 mark_inode_dirty(inode);
717 update_inode_page(inode);
718 }
719
720 unlock_page(page);
721 page_cache_release(page);
722 return copied;
723}
724
687static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 725static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
688 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 726 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
689{ 727{
@@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
698 get_data_block_ro); 736 get_data_block_ro);
699} 737}
700 738
701static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) 739static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
740 unsigned int length)
702{ 741{
703 struct inode *inode = page->mapping->host; 742 struct inode *inode = page->mapping->host;
704 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 743 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
740 .writepage = f2fs_write_data_page, 779 .writepage = f2fs_write_data_page,
741 .writepages = f2fs_write_data_pages, 780 .writepages = f2fs_write_data_pages,
742 .write_begin = f2fs_write_begin, 781 .write_begin = f2fs_write_begin,
743 .write_end = nobh_write_end, 782 .write_end = f2fs_write_end,
744 .set_page_dirty = f2fs_set_data_page_dirty, 783 .set_page_dirty = f2fs_set_data_page_dirty,
745 .invalidatepage = f2fs_invalidate_data_page, 784 .invalidatepage = f2fs_invalidate_data_page,
746 .releasepage = f2fs_release_data_page, 785 .releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -175,12 +175,12 @@ get_cache:
175 175
176static int stat_show(struct seq_file *s, void *v) 176static int stat_show(struct seq_file *s, void *v)
177{ 177{
178 struct f2fs_stat_info *si, *next; 178 struct f2fs_stat_info *si;
179 int i = 0; 179 int i = 0;
180 int j; 180 int j;
181 181
182 mutex_lock(&f2fs_stat_mutex); 182 mutex_lock(&f2fs_stat_mutex);
183 list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { 183 list_for_each_entry(si, &f2fs_stat_list, stat_list) {
184 char devname[BDEVNAME_SIZE]; 184 char devname[BDEVNAME_SIZE];
185 185
186 update_general_status(si->sbi); 186 update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1ac6b93036b7..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
13#include "f2fs.h" 13#include "f2fs.h"
14#include "node.h" 14#include "node.h"
15#include "acl.h" 15#include "acl.h"
16#include "xattr.h"
16 17
17static unsigned long dir_blocks(struct inode *inode) 18static unsigned long dir_blocks(struct inode *inode)
18{ 19{
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
215 216
216struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) 217struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
217{ 218{
218 struct page *page = NULL; 219 struct page *page;
219 struct f2fs_dir_entry *de = NULL; 220 struct f2fs_dir_entry *de;
220 struct f2fs_dentry_block *dentry_blk = NULL; 221 struct f2fs_dentry_block *dentry_blk;
221 222
222 page = get_lock_data_page(dir, 0); 223 page = get_lock_data_page(dir, 0);
223 if (IS_ERR(page)) 224 if (IS_ERR(page))
@@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
264 f2fs_put_page(page, 1); 265 f2fs_put_page(page, 1);
265} 266}
266 267
267void init_dent_inode(const struct qstr *name, struct page *ipage) 268static void init_dent_inode(const struct qstr *name, struct page *ipage)
268{ 269{
269 struct f2fs_node *rn; 270 struct f2fs_node *rn;
270 271
271 if (IS_ERR(ipage))
272 return;
273
274 wait_on_page_writeback(ipage);
275
276 /* copy name info. to this inode page */ 272 /* copy name info. to this inode page */
277 rn = (struct f2fs_node *)page_address(ipage); 273 rn = (struct f2fs_node *)page_address(ipage);
278 rn->i.i_namelen = cpu_to_le32(name->len); 274 rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
280 set_page_dirty(ipage); 276 set_page_dirty(ipage);
281} 277}
282 278
283static int make_empty_dir(struct inode *inode, struct inode *parent) 279static int make_empty_dir(struct inode *inode,
280 struct inode *parent, struct page *page)
284{ 281{
285 struct page *dentry_page; 282 struct page *dentry_page;
286 struct f2fs_dentry_block *dentry_blk; 283 struct f2fs_dentry_block *dentry_blk;
287 struct f2fs_dir_entry *de; 284 struct f2fs_dir_entry *de;
288 void *kaddr; 285 void *kaddr;
289 286
290 dentry_page = get_new_data_page(inode, 0, true); 287 dentry_page = get_new_data_page(inode, page, 0, true);
291 if (IS_ERR(dentry_page)) 288 if (IS_ERR(dentry_page))
292 return PTR_ERR(dentry_page); 289 return PTR_ERR(dentry_page);
293 290
@@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
317 return 0; 314 return 0;
318} 315}
319 316
320static int init_inode_metadata(struct inode *inode, 317static struct page *init_inode_metadata(struct inode *inode,
321 struct inode *dir, const struct qstr *name) 318 struct inode *dir, const struct qstr *name)
322{ 319{
320 struct page *page;
321 int err;
322
323 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 323 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
324 int err; 324 page = new_inode_page(inode, name);
325 err = new_inode_page(inode, name); 325 if (IS_ERR(page))
326 if (err) 326 return page;
327 return err;
328 327
329 if (S_ISDIR(inode->i_mode)) { 328 if (S_ISDIR(inode->i_mode)) {
330 err = make_empty_dir(inode, dir); 329 err = make_empty_dir(inode, dir, page);
331 if (err) { 330 if (err)
332 remove_inode_page(inode); 331 goto error;
333 return err;
334 }
335 } 332 }
336 333
337 err = f2fs_init_acl(inode, dir); 334 err = f2fs_init_acl(inode, dir);
338 if (err) { 335 if (err)
339 remove_inode_page(inode); 336 goto error;
340 return err; 337
341 } 338 err = f2fs_init_security(inode, dir, name, page);
339 if (err)
340 goto error;
341
342 wait_on_page_writeback(page);
342 } else { 343 } else {
343 struct page *ipage; 344 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
344 ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 345 if (IS_ERR(page))
345 if (IS_ERR(ipage)) 346 return page;
346 return PTR_ERR(ipage); 347
347 set_cold_node(inode, ipage); 348 wait_on_page_writeback(page);
348 init_dent_inode(name, ipage); 349 set_cold_node(inode, page);
349 f2fs_put_page(ipage, 1);
350 } 350 }
351
352 init_dent_inode(name, page);
353
354 /*
355 * This file should be checkpointed during fsync.
 356 * i_pino is lost from here on.
357 */
351 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { 358 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
359 file_lost_pino(inode);
352 inc_nlink(inode); 360 inc_nlink(inode);
353 update_inode_page(inode);
354 } 361 }
355 return 0; 362 return page;
363
364error:
365 f2fs_put_page(page, 1);
366 remove_inode_page(inode);
367 return ERR_PTR(err);
356} 368}
357 369
358static void update_parent_metadata(struct inode *dir, struct inode *inode, 370static void update_parent_metadata(struct inode *dir, struct inode *inode,
359 unsigned int current_depth) 371 unsigned int current_depth)
360{ 372{
361 bool need_dir_update = false;
362
363 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 373 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
364 if (S_ISDIR(inode->i_mode)) { 374 if (S_ISDIR(inode->i_mode)) {
365 inc_nlink(dir); 375 inc_nlink(dir);
366 need_dir_update = true; 376 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
367 } 377 }
368 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 378 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
369 } 379 }
370 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 380 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
371 if (F2FS_I(dir)->i_current_depth != current_depth) { 381 if (F2FS_I(dir)->i_current_depth != current_depth) {
372 F2FS_I(dir)->i_current_depth = current_depth; 382 F2FS_I(dir)->i_current_depth = current_depth;
373 need_dir_update = true; 383 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
374 } 384 }
375 385
376 if (need_dir_update) 386 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
377 update_inode_page(dir); 387 update_inode_page(dir);
378 else 388 else
379 mark_inode_dirty(dir); 389 mark_inode_dirty(dir);
@@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
423 struct page *dentry_page = NULL; 433 struct page *dentry_page = NULL;
424 struct f2fs_dentry_block *dentry_blk = NULL; 434 struct f2fs_dentry_block *dentry_blk = NULL;
425 int slots = GET_DENTRY_SLOTS(namelen); 435 int slots = GET_DENTRY_SLOTS(namelen);
436 struct page *page;
426 int err = 0; 437 int err = 0;
427 int i; 438 int i;
428 439
@@ -448,7 +459,7 @@ start:
448 bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); 459 bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
449 460
450 for (block = bidx; block <= (bidx + nblock - 1); block++) { 461 for (block = bidx; block <= (bidx + nblock - 1); block++) {
451 dentry_page = get_new_data_page(dir, block, true); 462 dentry_page = get_new_data_page(dir, NULL, block, true);
452 if (IS_ERR(dentry_page)) 463 if (IS_ERR(dentry_page))
453 return PTR_ERR(dentry_page); 464 return PTR_ERR(dentry_page);
454 465
@@ -465,12 +476,13 @@ start:
465 ++level; 476 ++level;
466 goto start; 477 goto start;
467add_dentry: 478add_dentry:
468 err = init_inode_metadata(inode, dir, name);
469 if (err)
470 goto fail;
471
472 wait_on_page_writeback(dentry_page); 479 wait_on_page_writeback(dentry_page);
473 480
481 page = init_inode_metadata(inode, dir, name);
482 if (IS_ERR(page)) {
483 err = PTR_ERR(page);
484 goto fail;
485 }
474 de = &dentry_blk->dentry[bit_pos]; 486 de = &dentry_blk->dentry[bit_pos];
475 de->hash_code = dentry_hash; 487 de->hash_code = dentry_hash;
476 de->name_len = cpu_to_le16(namelen); 488 de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@ add_dentry:
481 test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); 493 test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
482 set_page_dirty(dentry_page); 494 set_page_dirty(dentry_page);
483 495
484 update_parent_metadata(dir, inode, current_depth); 496 /* we don't need to mark_inode_dirty now */
485
486 /* update parent inode number before releasing dentry page */
487 F2FS_I(inode)->i_pino = dir->i_ino; 497 F2FS_I(inode)->i_pino = dir->i_ino;
498 update_inode(inode, page);
499 f2fs_put_page(page, 1);
500
501 update_parent_metadata(dir, inode, current_depth);
488fail: 502fail:
503 clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
489 kunmap(dentry_page); 504 kunmap(dentry_page);
490 f2fs_put_page(dentry_page, 1); 505 f2fs_put_page(dentry_page, 1);
491 return err; 506 return err;
@@ -591,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir)
591 return true; 606 return true;
592} 607}
593 608
594static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) 609static int f2fs_readdir(struct file *file, struct dir_context *ctx)
595{ 610{
596 unsigned long pos = file->f_pos;
597 struct inode *inode = file_inode(file); 611 struct inode *inode = file_inode(file);
598 unsigned long npages = dir_blocks(inode); 612 unsigned long npages = dir_blocks(inode);
599 unsigned char *types = NULL; 613 unsigned int bit_pos = 0;
600 unsigned int bit_pos = 0, start_bit_pos = 0;
601 int over = 0;
602 struct f2fs_dentry_block *dentry_blk = NULL; 614 struct f2fs_dentry_block *dentry_blk = NULL;
603 struct f2fs_dir_entry *de = NULL; 615 struct f2fs_dir_entry *de = NULL;
604 struct page *dentry_page = NULL; 616 struct page *dentry_page = NULL;
605 unsigned int n = 0; 617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
606 unsigned char d_type = DT_UNKNOWN; 618 unsigned char d_type = DT_UNKNOWN;
607 int slots;
608 619
609 types = f2fs_filetype_table; 620 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
610 bit_pos = (pos % NR_DENTRY_IN_BLOCK);
611 n = (pos / NR_DENTRY_IN_BLOCK);
612 621
613 for ( ; n < npages; n++) { 622 for ( ; n < npages; n++) {
614 dentry_page = get_lock_data_page(inode, n); 623 dentry_page = get_lock_data_page(inode, n);
615 if (IS_ERR(dentry_page)) 624 if (IS_ERR(dentry_page))
616 continue; 625 continue;
617 626
618 start_bit_pos = bit_pos;
619 dentry_blk = kmap(dentry_page); 627 dentry_blk = kmap(dentry_page);
620 while (bit_pos < NR_DENTRY_IN_BLOCK) { 628 while (bit_pos < NR_DENTRY_IN_BLOCK) {
621 d_type = DT_UNKNOWN;
622 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 629 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
623 NR_DENTRY_IN_BLOCK, 630 NR_DENTRY_IN_BLOCK,
624 bit_pos); 631 bit_pos);
@@ -626,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
626 break; 633 break;
627 634
628 de = &dentry_blk->dentry[bit_pos]; 635 de = &dentry_blk->dentry[bit_pos];
629 if (types && de->file_type < F2FS_FT_MAX) 636 if (de->file_type < F2FS_FT_MAX)
630 d_type = types[de->file_type]; 637 d_type = f2fs_filetype_table[de->file_type];
631 638 else
632 over = filldir(dirent, 639 d_type = DT_UNKNOWN;
640 if (!dir_emit(ctx,
633 dentry_blk->filename[bit_pos], 641 dentry_blk->filename[bit_pos],
634 le16_to_cpu(de->name_len), 642 le16_to_cpu(de->name_len),
635 (n * NR_DENTRY_IN_BLOCK) + bit_pos, 643 le32_to_cpu(de->ino), d_type))
636 le32_to_cpu(de->ino), d_type); 644 goto stop;
637 if (over) { 645
638 file->f_pos += bit_pos - start_bit_pos; 646 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
639 goto success; 647 ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
640 }
641 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
642 bit_pos += slots;
643 } 648 }
644 bit_pos = 0; 649 bit_pos = 0;
645 file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; 650 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
646 kunmap(dentry_page); 651 kunmap(dentry_page);
647 f2fs_put_page(dentry_page, 1); 652 f2fs_put_page(dentry_page, 1);
648 dentry_page = NULL; 653 dentry_page = NULL;
649 } 654 }
650success: 655stop:
651 if (dentry_page && !IS_ERR(dentry_page)) { 656 if (dentry_page && !IS_ERR(dentry_page)) {
652 kunmap(dentry_page); 657 kunmap(dentry_page);
653 f2fs_put_page(dentry_page, 1); 658 f2fs_put_page(dentry_page, 1);
@@ -659,7 +664,7 @@ success:
659const struct file_operations f2fs_dir_operations = { 664const struct file_operations f2fs_dir_operations = {
660 .llseek = generic_file_llseek, 665 .llseek = generic_file_llseek,
661 .read = generic_read_dir, 666 .read = generic_read_dir,
662 .readdir = f2fs_readdir, 667 .iterate = f2fs_readdir,
663 .fsync = f2fs_sync_file, 668 .fsync = f2fs_sync_file,
664 .unlocked_ioctl = f2fs_ioctl, 669 .unlocked_ioctl = f2fs_ioctl,
665}; 670};
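Under the old interface the filesystem called filldir and patched file->f_pos itself; with ->iterate it calls dir_emit and advances ctx->pos as it goes, so the VFS can resume at any slot. A stripped-down model of the contract — dir_context here is a simplification of the real VFS struct, not its actual layout:

#include <stdbool.h>
#include <stdio.h>

#define SLOTS_PER_BLOCK 4                  /* NR_DENTRY_IN_BLOCK stand-in */

struct dir_context {
    long long pos;                         /* slot-granular position */
    int budget;                            /* emit returns false when spent */
};

/* Mirrors dir_emit(): false means "buffer full, stop and resume later". */
static bool emit(struct dir_context *ctx, const char *name)
{
    if (ctx->budget-- <= 0)
        return false;
    printf("pos=%lld name=%s\n", ctx->pos, name);
    return true;
}

static void iterate(struct dir_context *ctx, const char *names[], int nblocks)
{
    long long n = ctx->pos / SLOTS_PER_BLOCK;
    int slot = ctx->pos % SLOTS_PER_BLOCK;

    for (; n < nblocks; n++, slot = 0) {
        for (; slot < SLOTS_PER_BLOCK; slot++) {
            const char *name = names[n * SLOTS_PER_BLOCK + slot];

            if (!name)
                continue;                  /* unused slot */
            if (!emit(ctx, name))
                return;                    /* resume here next call */
            ctx->pos = n * SLOTS_PER_BLOCK + slot + 1;
        }
        ctx->pos = (n + 1) * SLOTS_PER_BLOCK;
    }
}

int main(void)
{
    const char *names[8] = { ".", "..", NULL, "a", "b", NULL, NULL, "c" };
    struct dir_context ctx = { 0, 3 };

    iterate(&ctx, names, 2);               /* emits ".", "..", "a"; stops */
    ctx.budget = 8;
    iterate(&ctx, names, 2);               /* resumes at "b", then "c" */
    return 0;
}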
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
37 typecheck(unsigned long long, b) && \ 37 typecheck(unsigned long long, b) && \
38 ((long long)((a) - (b)) > 0)) 38 ((long long)((a) - (b)) > 0))
39 39
40typedef u64 block_t; 40typedef u32 block_t; /*
41 * must not be changed from u32, since it is the on-disk block
42 * address format, __le32.
43 */
41typedef u32 nid_t; 44typedef u32 nid_t;
42 45
43struct f2fs_mount_info { 46struct f2fs_mount_info {
44 unsigned int opt; 47 unsigned int opt;
45}; 48};
46 49
47static inline __u32 f2fs_crc32(void *buff, size_t len) 50#define CRCPOLY_LE 0xedb88320
51
52static inline __u32 f2fs_crc32(void *buf, size_t len)
48{ 53{
49 return crc32_le(F2FS_SUPER_MAGIC, buff, len); 54 unsigned char *p = (unsigned char *)buf;
55 __u32 crc = F2FS_SUPER_MAGIC;
56 int i;
57
58 while (len--) {
59 crc ^= *p++;
60 for (i = 0; i < 8; i++)
61 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
62 }
63 return crc;
50} 64}
51 65
52static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) 66static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
53{ 67{
54 return f2fs_crc32(buff, buff_size) == blk_crc; 68 return f2fs_crc32(buf, buf_size) == blk_crc;
55} 69}
56 70
57/* 71/*
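The new helper computes CRC32 with the standard little-endian polynomial but a nonstandard seed (F2FS_SUPER_MAGIC) and no final inversion, so it will not match textbook CRC32 output. A userspace self-test pitting the same bitwise loop against a table-driven variant of the identical computation — the table code is an assumption for cross-checking, not kernel code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRCPOLY_LE 0xedb88320u
#define SEED       0xF2F52010u             /* F2FS_SUPER_MAGIC */

static uint32_t crc32_bitwise(const void *buf, size_t len)
{
    const unsigned char *p = buf;
    uint32_t crc = SEED;
    int i;

    while (len--) {
        crc ^= *p++;
        for (i = 0; i < 8; i++)
            crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
    }
    return crc;
}

static uint32_t crc32_table(const void *buf, size_t len)
{
    static uint32_t tbl[256];
    const unsigned char *p = buf;
    uint32_t crc = SEED;
    int i, j;

    if (!tbl[1]) {                         /* build the table once */
        for (i = 0; i < 256; i++) {
            uint32_t c = i;
            for (j = 0; j < 8; j++)
                c = (c >> 1) ^ ((c & 1) ? CRCPOLY_LE : 0);
            tbl[i] = c;
        }
    }
    while (len--)
        crc = (crc >> 8) ^ tbl[(crc ^ *p++) & 0xff];
    return crc;
}

int main(void)
{
    const char msg[] = "f2fs checkpoint block";
    uint32_t a = crc32_bitwise(msg, strlen(msg));
    uint32_t b = crc32_table(msg, strlen(msg));

    printf("bitwise=0x%08x table=0x%08x %s\n",
           a, b, a == b ? "MATCH" : "MISMATCH");
    return 0;
}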
@@ -148,7 +162,7 @@ struct extent_info {
148 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 162 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
149 */ 163 */
150#define FADVISE_COLD_BIT 0x01 164#define FADVISE_COLD_BIT 0x01
151#define FADVISE_CP_BIT 0x02 165#define FADVISE_LOST_PINO_BIT 0x02
152 166
153struct f2fs_inode_info { 167struct f2fs_inode_info {
154 struct inode vfs_inode; /* serve a vfs inode */ 168 struct inode vfs_inode; /* serve a vfs inode */
@@ -369,7 +383,6 @@ struct f2fs_sb_info {
369 /* for directory inode management */ 383 /* for directory inode management */
370 struct list_head dir_inode_list; /* dir inode list */ 384 struct list_head dir_inode_list; /* dir inode list */
371 spinlock_t dir_inode_lock; /* for dir inode list lock */ 385 spinlock_t dir_inode_lock; /* for dir inode list lock */
372 unsigned int n_dirty_dirs; /* # of dir inodes */
373 386
374 /* basic file system units */ 387 /* basic file system units */
375 unsigned int log_sectors_per_block; /* log2 sectors per block */ 388 unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,12 +419,15 @@ struct f2fs_sb_info {
406 * for stat information. 419 * for stat information.
407 * one is for the LFS mode, and the other is for the SSR mode. 420 * one is for the LFS mode, and the other is for the SSR mode.
408 */ 421 */
422#ifdef CONFIG_F2FS_STAT_FS
409 struct f2fs_stat_info *stat_info; /* FS status information */ 423 struct f2fs_stat_info *stat_info; /* FS status information */
410 unsigned int segment_count[2]; /* # of allocated segments */ 424 unsigned int segment_count[2]; /* # of allocated segments */
411 unsigned int block_count[2]; /* # of allocated blocks */ 425 unsigned int block_count[2]; /* # of allocated blocks */
412 unsigned int last_victim[2]; /* last victim segment # */
413 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 426 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
414 int bg_gc; /* background gc calls */ 427 int bg_gc; /* background gc calls */
428 unsigned int n_dirty_dirs; /* # of dir inodes */
429#endif
430 unsigned int last_victim[2]; /* last victim segment # */
415 spinlock_t stat_lock; /* lock for stat operations */ 431 spinlock_t stat_lock; /* lock for stat operations */
416}; 432};
417 433
@@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
495 511
496static inline void mutex_lock_all(struct f2fs_sb_info *sbi) 512static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
497{ 513{
498 int i = 0; 514 int i;
499 for (; i < NR_GLOBAL_LOCKS; i++) 515
500 mutex_lock(&sbi->fs_lock[i]); 516 for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
517 /*
518 * This is the only time we take multiple fs_lock[]
519 * instances; the order is immaterial since we
520 * always hold cp_mutex, which serializes multiple
521 * such operations.
522 */
523 mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
524 }
501} 525}
502 526
503static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) 527static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
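mutex_lock_nest_lock() tells lockdep that cp_mutex already serializes anyone taking the whole fs_lock[] array, so acquiring N same-class mutexes in a row is not flagged as deadlock-prone. Userspace has no lockdep, but the locking pattern itself looks like this pthread analogue (here the outer lock is taken inside lock_all for self-containment):

#include <pthread.h>
#include <stdio.h>

#define NR_LOCKS 4

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* cp_mutex role */
static pthread_mutex_t locks[NR_LOCKS] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Take every lock; safe because 'outer' serializes all mass-lockers,
 * so two threads can never interleave partial acquisitions. */
static void lock_all(void)
{
    int i;

    pthread_mutex_lock(&outer);
    for (i = 0; i < NR_LOCKS; i++)
        pthread_mutex_lock(&locks[i]);
}

static void unlock_all(void)
{
    int i;

    for (i = 0; i < NR_LOCKS; i++)
        pthread_mutex_unlock(&locks[i]);
    pthread_mutex_unlock(&outer);
}

int main(void)
{
    lock_all();
    puts("exclusive section: all fs locks held");
    unlock_all();
    return 0;
}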
@@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
843/* used for f2fs_inode_info->flags */ 867/* used for f2fs_inode_info->flags */
844enum { 868enum {
845 FI_NEW_INODE, /* indicate newly allocated inode */ 869 FI_NEW_INODE, /* indicate newly allocated inode */
870 FI_DIRTY_INODE, /* indicate inode is dirty or not */
846 FI_INC_LINK, /* need to increment i_nlink */ 871 FI_INC_LINK, /* need to increment i_nlink */
847 FI_ACL_MODE, /* indicate acl mode */ 872 FI_ACL_MODE, /* indicate acl mode */
848 FI_NO_ALLOC, /* should not allocate any blocks */ 873 FI_NO_ALLOC, /* should not allocate any blocks */
874 FI_UPDATE_DIR, /* should update inode block for consistency */
875 FI_DELAY_IPUT, /* used for the recovery */
849}; 876};
850 877
851static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 878static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
878 return 0; 905 return 0;
879} 906}
880 907
908static inline int f2fs_readonly(struct super_block *sb)
909{
910 return sb->s_flags & MS_RDONLY;
911}
912
881/* 913/*
882 * file.c 914 * file.c
883 */ 915 */
884int f2fs_sync_file(struct file *, loff_t, loff_t, int); 916int f2fs_sync_file(struct file *, loff_t, loff_t, int);
885void truncate_data_blocks(struct dnode_of_data *); 917void truncate_data_blocks(struct dnode_of_data *);
886void f2fs_truncate(struct inode *); 918void f2fs_truncate(struct inode *);
919int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
887int f2fs_setattr(struct dentry *, struct iattr *); 920int f2fs_setattr(struct dentry *, struct iattr *);
888int truncate_hole(struct inode *, pgoff_t, pgoff_t); 921int truncate_hole(struct inode *, pgoff_t, pgoff_t);
922int truncate_data_blocks_range(struct dnode_of_data *, int);
889long f2fs_ioctl(struct file *, unsigned int, unsigned long); 923long f2fs_ioctl(struct file *, unsigned int, unsigned long);
890long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); 924long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
891 925
@@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
913ino_t f2fs_inode_by_name(struct inode *, struct qstr *); 947ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
914void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, 948void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
915 struct page *, struct inode *); 949 struct page *, struct inode *);
916void init_dent_inode(const struct qstr *, struct page *);
917int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); 950int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
918void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); 951void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
919int f2fs_make_empty(struct inode *, struct inode *); 952int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
948int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 981int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
949int truncate_inode_blocks(struct inode *, pgoff_t); 982int truncate_inode_blocks(struct inode *, pgoff_t);
950int remove_inode_page(struct inode *); 983int remove_inode_page(struct inode *);
951int new_inode_page(struct inode *, const struct qstr *); 984struct page *new_inode_page(struct inode *, const struct qstr *);
952struct page *new_node_page(struct dnode_of_data *, unsigned int); 985struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
953void ra_node_page(struct f2fs_sb_info *, nid_t); 986void ra_node_page(struct f2fs_sb_info *, nid_t);
954struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); 987struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
955struct page *get_node_page_ra(struct page *, int); 988struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void);
974 */ 1007 */
975void f2fs_balance_fs(struct f2fs_sb_info *); 1008void f2fs_balance_fs(struct f2fs_sb_info *);
976void invalidate_blocks(struct f2fs_sb_info *, block_t); 1009void invalidate_blocks(struct f2fs_sb_info *, block_t);
977void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
978void clear_prefree_segments(struct f2fs_sb_info *); 1010void clear_prefree_segments(struct f2fs_sb_info *);
979int npages_for_summary_flush(struct f2fs_sb_info *); 1011int npages_for_summary_flush(struct f2fs_sb_info *);
980void allocate_new_segments(struct f2fs_sb_info *); 1012void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1011int recover_orphan_inodes(struct f2fs_sb_info *); 1043int recover_orphan_inodes(struct f2fs_sb_info *);
1012int get_valid_checkpoint(struct f2fs_sb_info *); 1044int get_valid_checkpoint(struct f2fs_sb_info *);
1013void set_dirty_dir_page(struct inode *, struct page *); 1045void set_dirty_dir_page(struct inode *, struct page *);
1046void add_dirty_dir_inode(struct inode *);
1014void remove_dirty_dir_inode(struct inode *); 1047void remove_dirty_dir_inode(struct inode *);
1048struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
1015void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1049void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1016void write_checkpoint(struct f2fs_sb_info *, bool); 1050void write_checkpoint(struct f2fs_sb_info *, bool);
1017void init_orphan_info(struct f2fs_sb_info *); 1051void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *);
1025void update_extent_cache(block_t, struct dnode_of_data *); 1059void update_extent_cache(block_t, struct dnode_of_data *);
1026struct page *find_data_page(struct inode *, pgoff_t, bool); 1060struct page *find_data_page(struct inode *, pgoff_t, bool);
1027struct page *get_lock_data_page(struct inode *, pgoff_t); 1061struct page *get_lock_data_page(struct inode *, pgoff_t);
1028struct page *get_new_data_page(struct inode *, pgoff_t, bool); 1062struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1029int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1063int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
1030int do_write_data_page(struct page *); 1064int do_write_data_page(struct page *);
1031 1065
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
63 f2fs_put_dnode(&dn); 63 f2fs_put_dnode(&dn);
64 mutex_unlock_op(sbi, ilock); 64 mutex_unlock_op(sbi, ilock);
65 65
66 file_update_time(vma->vm_file);
66 lock_page(page); 67 lock_page(page);
67 if (page->mapping != inode->i_mapping || 68 if (page->mapping != inode->i_mapping ||
68 page_offset(page) >= i_size_read(inode) || 69 page_offset(page) > i_size_read(inode) ||
69 !PageUptodate(page)) { 70 !PageUptodate(page)) {
70 unlock_page(page); 71 unlock_page(page);
71 err = -EFAULT; 72 err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
76 * check to see if the page is mapped already (no holes) 77 * check to see if the page is mapped already (no holes)
77 */ 78 */
78 if (PageMappedToDisk(page)) 79 if (PageMappedToDisk(page))
79 goto out; 80 goto mapped;
80
81 /* fill the page */
82 wait_on_page_writeback(page);
83 81
84 /* page is wholly or partially inside EOF */ 82 /* page is wholly or partially inside EOF */
85 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { 83 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
90 set_page_dirty(page); 88 set_page_dirty(page);
91 SetPageUptodate(page); 89 SetPageUptodate(page);
92 90
93 file_update_time(vma->vm_file); 91mapped:
92 /* fill the page */
93 wait_on_page_writeback(page);
94out: 94out:
95 sb_end_pagefault(inode->i_sb); 95 sb_end_pagefault(inode->i_sb);
96 return block_page_mkwrite_return(err); 96 return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
102 .remap_pages = generic_file_remap_pages, 102 .remap_pages = generic_file_remap_pages,
103}; 103};
104 104
105static int get_parent_ino(struct inode *inode, nid_t *pino)
106{
107 struct dentry *dentry;
108
109 inode = igrab(inode);
110 dentry = d_find_any_alias(inode);
111 iput(inode);
112 if (!dentry)
113 return 0;
114
115 inode = igrab(dentry->d_parent->d_inode);
116 dput(dentry);
117
118 *pino = inode->i_ino;
119 iput(inode);
120 return 1;
121}
122
105int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 123int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
106{ 124{
107 struct inode *inode = file->f_mapping->host; 125 struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
114 .for_reclaim = 0, 132 .for_reclaim = 0,
115 }; 133 };
116 134
117 if (inode->i_sb->s_flags & MS_RDONLY) 135 if (f2fs_readonly(inode->i_sb))
118 return 0; 136 return 0;
119 137
120 trace_f2fs_sync_file_enter(inode); 138 trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
134 152
135 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 153 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
136 need_cp = true; 154 need_cp = true;
137 else if (is_cp_file(inode)) 155 else if (file_wrong_pino(inode))
138 need_cp = true; 156 need_cp = true;
139 else if (!space_for_roll_forward(sbi)) 157 else if (!space_for_roll_forward(sbi))
140 need_cp = true; 158 need_cp = true;
@@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
142 need_cp = true; 160 need_cp = true;
143 161
144 if (need_cp) { 162 if (need_cp) {
163 nid_t pino;
164
145 /* all the dirty node pages should be flushed for POR */ 165 /* all the dirty node pages should be flushed for POR */
146 ret = f2fs_sync_fs(inode->i_sb, 1); 166 ret = f2fs_sync_fs(inode->i_sb, 1);
167 if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
168 get_parent_ino(inode, &pino)) {
169 F2FS_I(inode)->i_pino = pino;
170 file_got_pino(inode);
171 mark_inode_dirty_sync(inode);
172 ret = f2fs_write_inode(inode, NULL);
173 if (ret)
174 goto out;
175 }
147 } else { 176 } else {
148 /* if there is no written node page, write its inode page */ 177 /* if there is no written node page, write its inode page */
149 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 178 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
179 mark_inode_dirty_sync(inode);
150 ret = f2fs_write_inode(inode, NULL); 180 ret = f2fs_write_inode(inode, NULL);
151 if (ret) 181 if (ret)
152 goto out; 182 goto out;
@@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
168 return 0; 198 return 0;
169} 199}
170 200
171static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 201int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
172{ 202{
173 int nr_free = 0, ofs = dn->ofs_in_node; 203 int nr_free = 0, ofs = dn->ofs_in_node;
174 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 204 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
185 215
186 update_extent_cache(NULL_ADDR, dn); 216 update_extent_cache(NULL_ADDR, dn);
187 invalidate_blocks(sbi, blkaddr); 217 invalidate_blocks(sbi, blkaddr);
188 dec_valid_block_count(sbi, dn->inode, 1);
189 nr_free++; 218 nr_free++;
190 } 219 }
191 if (nr_free) { 220 if (nr_free) {
221 dec_valid_block_count(sbi, dn->inode, nr_free);
192 set_page_dirty(dn->node_page); 222 set_page_dirty(dn->node_page);
193 sync_inode_page(dn); 223 sync_inode_page(dn);
194 } 224 }
@@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode)
291 } 321 }
292} 322}
293 323
294static int f2fs_getattr(struct vfsmount *mnt, 324int f2fs_getattr(struct vfsmount *mnt,
295 struct dentry *dentry, struct kstat *stat) 325 struct dentry *dentry, struct kstat *stat)
296{ 326{
297 struct inode *inode = dentry->d_inode; 327 struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
387 f2fs_balance_fs(sbi); 417 f2fs_balance_fs(sbi);
388 418
389 ilock = mutex_lock_op(sbi); 419 ilock = mutex_lock_op(sbi);
390 page = get_new_data_page(inode, index, false); 420 page = get_new_data_page(inode, NULL, index, false);
391 mutex_unlock_op(sbi, ilock); 421 mutex_unlock_op(sbi, ilock);
392 422
393 if (!IS_ERR(page)) { 423 if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
575 int ret; 605 int ret;
576 606
577 switch (cmd) { 607 switch (cmd) {
578 case FS_IOC_GETFLAGS: 608 case F2FS_IOC_GETFLAGS:
579 flags = fi->i_flags & FS_FL_USER_VISIBLE; 609 flags = fi->i_flags & FS_FL_USER_VISIBLE;
580 return put_user(flags, (int __user *) arg); 610 return put_user(flags, (int __user *) arg);
581 case FS_IOC_SETFLAGS: 611 case F2FS_IOC_SETFLAGS:
582 { 612 {
583 unsigned int oldflags; 613 unsigned int oldflags;
584 614
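
The fsync path above repairs a stale parent inode number before deciding on a checkpoint: file_wrong_pino() tests the FADVISE_LOST_PINO_BIT, get_parent_ino() resolves a parent through any cached dentry alias, and file_got_pino() clears the bit once i_pino is fixed. Reduced to a helper (fix_lost_pino() is an illustrative name; the logic follows the hunk above):

static int fix_lost_pino(struct inode *inode)
{
        nid_t pino;

        if (!file_wrong_pino(inode) || inode->i_nlink != 1)
                return 0;
        if (!get_parent_ino(inode, &pino))
                return 0;       /* no cached alias; leave it for later */

        F2FS_I(inode)->i_pino = pino;
        file_got_pino(inode);
        mark_inode_dirty_sync(inode);
        return f2fs_write_inode(inode, NULL);
}
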
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,7 +76,9 @@ static int gc_thread_func(void *data)
76 else 76 else
77 wait_ms = increase_sleep_time(wait_ms); 77 wait_ms = increase_sleep_time(wait_ms);
78 78
79#ifdef CONFIG_F2FS_STAT_FS
79 sbi->bg_gc++; 80 sbi->bg_gc++;
81#endif
80 82
81 /* if return value is not zero, no victim was selected */ 83 /* if return value is not zero, no victim was selected */
82 if (f2fs_gc(sbi)) 84 if (f2fs_gc(sbi))
@@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
89{ 91{
90 struct f2fs_gc_kthread *gc_th; 92 struct f2fs_gc_kthread *gc_th;
91 dev_t dev = sbi->sb->s_bdev->bd_dev; 93 dev_t dev = sbi->sb->s_bdev->bd_dev;
94 int err = 0;
92 95
93 if (!test_opt(sbi, BG_GC)) 96 if (!test_opt(sbi, BG_GC))
94 return 0; 97 goto out;
95 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); 98 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
96 if (!gc_th) 99 if (!gc_th) {
97 return -ENOMEM; 100 err = -ENOMEM;
101 goto out;
102 }
98 103
99 sbi->gc_thread = gc_th; 104 sbi->gc_thread = gc_th;
100 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 105 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
101 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 106 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
102 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); 107 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
103 if (IS_ERR(gc_th->f2fs_gc_task)) { 108 if (IS_ERR(gc_th->f2fs_gc_task)) {
109 err = PTR_ERR(gc_th->f2fs_gc_task);
104 kfree(gc_th); 110 kfree(gc_th);
105 sbi->gc_thread = NULL; 111 sbi->gc_thread = NULL;
106 return -ENOMEM;
107 } 112 }
108 return 0; 113
114out:
115 return err;
109} 116}
110 117
111void stop_gc_thread(struct f2fs_sb_info *sbi) 118void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
234{ 241{
235 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 242 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
236 struct victim_sel_policy p; 243 struct victim_sel_policy p;
237 unsigned int secno; 244 unsigned int secno, max_cost;
238 int nsearched = 0; 245 int nsearched = 0;
239 246
240 p.alloc_mode = alloc_mode; 247 p.alloc_mode = alloc_mode;
241 select_policy(sbi, gc_type, type, &p); 248 select_policy(sbi, gc_type, type, &p);
242 249
243 p.min_segno = NULL_SEGNO; 250 p.min_segno = NULL_SEGNO;
244 p.min_cost = get_max_cost(sbi, &p); 251 p.min_cost = max_cost = get_max_cost(sbi, &p);
245 252
246 mutex_lock(&dirty_i->seglist_lock); 253 mutex_lock(&dirty_i->seglist_lock);
247 254
@@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
280 p.min_cost = cost; 287 p.min_cost = cost;
281 } 288 }
282 289
283 if (cost == get_max_cost(sbi, &p)) 290 if (cost == max_cost)
284 continue; 291 continue;
285 292
286 if (nsearched++ >= MAX_VICTIM_SEARCH) { 293 if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
288 break; 295 break;
289 } 296 }
290 } 297 }
291got_it:
292 if (p.min_segno != NULL_SEGNO) { 298 if (p.min_segno != NULL_SEGNO) {
299got_it:
293 if (p.alloc_mode == LFS) { 300 if (p.alloc_mode == LFS) {
294 secno = GET_SECNO(sbi, p.min_segno); 301 secno = GET_SECNO(sbi, p.min_segno);
295 if (gc_type == FG_GC) 302 if (gc_type == FG_GC)
@@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = {
314 321
315static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) 322static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
316{ 323{
317 struct list_head *this;
318 struct inode_entry *ie; 324 struct inode_entry *ie;
319 325
320 list_for_each(this, ilist) { 326 list_for_each_entry(ie, ilist, list)
321 ie = list_entry(this, struct inode_entry, list);
322 if (ie->inode->i_ino == ino) 327 if (ie->inode->i_ino == ino)
323 return ie->inode; 328 return ie->inode;
324 }
325 return NULL; 329 return NULL;
326} 330}
327 331
328static void add_gc_inode(struct inode *inode, struct list_head *ilist) 332static void add_gc_inode(struct inode *inode, struct list_head *ilist)
329{ 333{
330 struct list_head *this; 334 struct inode_entry *new_ie;
331 struct inode_entry *new_ie, *ie;
332 335
333 list_for_each(this, ilist) { 336 if (inode == find_gc_inode(inode->i_ino, ilist)) {
334 ie = list_entry(this, struct inode_entry, list); 337 iput(inode);
335 if (ie->inode == inode) { 338 return;
336 iput(inode);
337 return;
338 }
339 } 339 }
340repeat: 340repeat:
341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); 341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
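
Two of the gc.c hunks wrap statistics counters in CONFIG_F2FS_STAT_FS so that builds without the stats code do not touch (or even need) the counter fields. A common way to keep call sites free of #ifdef clutter is a pair of inline helpers, sketched here under the assumption that sbi->bg_gc only exists when the option is enabled (stat_inc_bggc() is an illustrative name, not from the patch):

#ifdef CONFIG_F2FS_STAT_FS
static inline void stat_inc_bggc(struct f2fs_sb_info *sbi)
{
        sbi->bg_gc++;
}
#else
static inline void stat_inc_bggc(struct f2fs_sb_info *sbi) { }
#endif

With such a helper, gc_thread_func() could bump the counter unconditionally and the #ifdef would live in one place.
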
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
109 ret = do_read_inode(inode); 109 ret = do_read_inode(inode);
110 if (ret) 110 if (ret)
111 goto bad_inode; 111 goto bad_inode;
112
113 if (!sbi->por_doing && inode->i_nlink == 0) {
114 ret = -ENOENT;
115 goto bad_inode;
116 }
117
118make_now: 112make_now:
119 if (ino == F2FS_NODE_INO(sbi)) { 113 if (ino == F2FS_NODE_INO(sbi)) {
120 inode->i_mapping->a_ops = &f2fs_node_aops; 114 inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
130 inode->i_op = &f2fs_dir_inode_operations; 124 inode->i_op = &f2fs_dir_inode_operations;
131 inode->i_fop = &f2fs_dir_operations; 125 inode->i_fop = &f2fs_dir_operations;
132 inode->i_mapping->a_ops = &f2fs_dblock_aops; 126 inode->i_mapping->a_ops = &f2fs_dblock_aops;
133 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | 127 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
134 __GFP_ZERO);
135 } else if (S_ISLNK(inode->i_mode)) { 128 } else if (S_ISLNK(inode->i_mode)) {
136 inode->i_op = &f2fs_symlink_inode_operations; 129 inode->i_op = &f2fs_symlink_inode_operations;
137 inode->i_mapping->a_ops = &f2fs_dblock_aops; 130 inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
199 192
200 set_cold_node(inode, node_page); 193 set_cold_node(inode, node_page);
201 set_page_dirty(node_page); 194 set_page_dirty(node_page);
195 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
202} 196}
203 197
204int update_inode_page(struct inode *inode) 198int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
224 inode->i_ino == F2FS_META_INO(sbi)) 218 inode->i_ino == F2FS_META_INO(sbi))
225 return 0; 219 return 0;
226 220
221 if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
222 return 0;
223
227 if (wbc) 224 if (wbc)
228 f2fs_balance_fs(sbi); 225 f2fs_balance_fs(sbi);
229 226
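
The inode.c changes make write-back conditional on a new FI_DIRTY_INODE flag: ->dirty_inode (added in super.c below) sets it, update_inode() clears it after copying fields into the node page, and f2fs_write_inode() becomes a no-op while it is clear. A condensed sketch of the resulting contract (flush_inode_if_dirty() is illustrative):

static int flush_inode_if_dirty(struct inode *inode)
{
        /* set by f2fs_dirty_inode(), cleared by update_inode() */
        if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
                return 0;
        return f2fs_write_inode(inode, NULL);
}
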
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
112 int count = le32_to_cpu(sbi->raw_super->extension_count); 112 int count = le32_to_cpu(sbi->raw_super->extension_count);
113 for (i = 0; i < count; i++) { 113 for (i = 0; i < count; i++) {
114 if (is_multimedia_file(name, extlist[i])) { 114 if (is_multimedia_file(name, extlist[i])) {
115 set_cold_file(inode); 115 file_set_cold(inode);
116 break; 116 break;
117 } 117 }
118 } 118 }
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
149 149
150 alloc_nid_done(sbi, ino); 150 alloc_nid_done(sbi, ino);
151 151
152 if (!sbi->por_doing) 152 d_instantiate(dentry, inode);
153 d_instantiate(dentry, inode);
154 unlock_new_inode(inode); 153 unlock_new_inode(inode);
155 return 0; 154 return 0;
156out: 155out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
173 f2fs_balance_fs(sbi); 172 f2fs_balance_fs(sbi);
174 173
175 inode->i_ctime = CURRENT_TIME; 174 inode->i_ctime = CURRENT_TIME;
176 atomic_inc(&inode->i_count); 175 ihold(inode);
177 176
178 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 177 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
179 ilock = mutex_lock_op(sbi); 178 ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
182 if (err) 181 if (err)
183 goto out; 182 goto out;
184 183
185 /*
186 * This file should be checkpointed during fsync.
187 * We lost i_pino from now on.
188 */
189 set_cp_file(inode);
190
191 d_instantiate(dentry, inode); 184 d_instantiate(dentry, inode);
192 return 0; 185 return 0;
193out: 186out:
194 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 187 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
195 make_bad_inode(inode);
196 iput(inode); 188 iput(inode);
197 return err; 189 return err;
198} 190}
@@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
498 .rmdir = f2fs_rmdir, 490 .rmdir = f2fs_rmdir,
499 .mknod = f2fs_mknod, 491 .mknod = f2fs_mknod,
500 .rename = f2fs_rename, 492 .rename = f2fs_rename,
493 .getattr = f2fs_getattr,
501 .setattr = f2fs_setattr, 494 .setattr = f2fs_setattr,
502 .get_acl = f2fs_get_acl, 495 .get_acl = f2fs_get_acl,
503#ifdef CONFIG_F2FS_FS_XATTR 496#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
512 .readlink = generic_readlink, 505 .readlink = generic_readlink,
513 .follow_link = page_follow_link_light, 506 .follow_link = page_follow_link_light,
514 .put_link = page_put_link, 507 .put_link = page_put_link,
508 .getattr = f2fs_getattr,
515 .setattr = f2fs_setattr, 509 .setattr = f2fs_setattr,
516#ifdef CONFIG_F2FS_FS_XATTR 510#ifdef CONFIG_F2FS_FS_XATTR
517 .setxattr = generic_setxattr, 511 .setxattr = generic_setxattr,
@@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
522}; 516};
523 517
524const struct inode_operations f2fs_special_inode_operations = { 518const struct inode_operations f2fs_special_inode_operations = {
519 .getattr = f2fs_getattr,
525 .setattr = f2fs_setattr, 520 .setattr = f2fs_setattr,
526 .get_acl = f2fs_get_acl, 521 .get_acl = f2fs_get_acl,
527#ifdef CONFIG_F2FS_FS_XATTR 522#ifdef CONFIG_F2FS_FS_XATTR
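
f2fs_link() now takes its extra reference with ihold() rather than incrementing i_count by hand; ihold() is the VFS helper for bumping a reference on an inode the caller already holds, and it pairs with iput() on the failure path. The shape of the pattern, with add_name() standing in for the directory-entry insertion (illustrative, not from the patch):

static int link_pattern(struct inode *dir, struct dentry *dentry,
                        struct inode *inode)
{
        int err;

        ihold(inode);                   /* reference for the new name */
        err = add_name(dir, dentry, inode);
        if (err) {
                iput(inode);            /* balance ihold() on failure */
                return err;
        }
        d_instantiate(dentry, inode);
        return 0;
}
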
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
408 level = get_node_path(index, offset, noffset); 408 level = get_node_path(index, offset, noffset);
409 409
410 nids[0] = dn->inode->i_ino; 410 nids[0] = dn->inode->i_ino;
411 npage[0] = get_node_page(sbi, nids[0]); 411 npage[0] = dn->inode_page;
412 if (IS_ERR(npage[0]))
413 return PTR_ERR(npage[0]);
414 412
413 if (!npage[0]) {
414 npage[0] = get_node_page(sbi, nids[0]);
415 if (IS_ERR(npage[0]))
416 return PTR_ERR(npage[0]);
417 }
415 parent = npage[0]; 418 parent = npage[0];
416 if (level != 0) 419 if (level != 0)
417 nids[1] = get_nid(parent, offset[0], true); 420 nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
430 } 433 }
431 434
432 dn->nid = nids[i]; 435 dn->nid = nids[i];
433 npage[i] = new_node_page(dn, noffset[i]); 436 npage[i] = new_node_page(dn, noffset[i], NULL);
434 if (IS_ERR(npage[i])) { 437 if (IS_ERR(npage[i])) {
435 alloc_nid_failed(sbi, nids[i]); 438 alloc_nid_failed(sbi, nids[i]);
436 err = PTR_ERR(npage[i]); 439 err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
803 return 0; 806 return 0;
804} 807}
805 808
806int new_inode_page(struct inode *inode, const struct qstr *name) 809struct page *new_inode_page(struct inode *inode, const struct qstr *name)
807{ 810{
808 struct page *page;
809 struct dnode_of_data dn; 811 struct dnode_of_data dn;
810 812
811 /* allocate inode page for new inode */ 813 /* allocate inode page for new inode */
812 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); 814 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
813 page = new_node_page(&dn, 0); 815
814 init_dent_inode(name, page); 816 /* caller should f2fs_put_page(page, 1); */
815 if (IS_ERR(page)) 817 return new_node_page(&dn, 0, NULL);
816 return PTR_ERR(page);
817 f2fs_put_page(page, 1);
818 return 0;
819} 818}
820 819
821struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) 820struct page *new_node_page(struct dnode_of_data *dn,
821 unsigned int ofs, struct page *ipage)
822{ 822{
823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
824 struct address_space *mapping = sbi->node_inode->i_mapping; 824 struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
851 set_cold_node(dn->inode, page); 851 set_cold_node(dn->inode, page);
852 852
853 dn->node_page = page; 853 dn->node_page = page;
854 sync_inode_page(dn); 854 if (ipage)
855 update_inode(dn->inode, ipage);
856 else
857 sync_inode_page(dn);
855 set_page_dirty(page); 858 set_page_dirty(page);
856 if (ofs == 0) 859 if (ofs == 0)
857 inc_valid_inode_count(sbi); 860 inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
1205 return 0; 1208 return 0;
1206} 1209}
1207 1210
1208static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) 1211static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1212 unsigned int length)
1209{ 1213{
1210 struct inode *inode = page->mapping->host; 1214 struct inode *inode = page->mapping->host;
1211 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1215 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1492 new_ni = old_ni; 1496 new_ni = old_ni;
1493 new_ni.ino = ino; 1497 new_ni.ino = ino;
1494 1498
1499 if (!inc_valid_node_count(sbi, NULL, 1))
1500 WARN_ON(1);
1495 set_node_addr(sbi, &new_ni, NEW_ADDR); 1501 set_node_addr(sbi, &new_ni, NEW_ADDR);
1496 inc_valid_inode_count(sbi); 1502 inc_valid_inode_count(sbi);
1497
1498 f2fs_put_page(ipage, 1); 1503 f2fs_put_page(ipage, 1);
1499 return 0; 1504 return 0;
1500} 1505}
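
new_inode_page() now hands the locked inode page back to the caller instead of initializing and releasing it internally, per the "caller should f2fs_put_page(page, 1);" comment. A sketch of the calling convention that implies (make_inode_page() is an illustrative wrapper):

static int make_inode_page(struct inode *inode, const struct qstr *name)
{
        struct page *page;

        page = new_inode_page(inode, name);
        if (IS_ERR(page))
                return PTR_ERR(page);

        init_dent_inode(name, page);    /* caller now does the init */
        f2fs_put_page(page, 1);         /* unlock and release */
        return 0;
}
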
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
275 * - Mark cold node blocks in their node footer 275 * - Mark cold node blocks in their node footer
276 * - Mark cold data pages in page cache 276 * - Mark cold data pages in page cache
277 */ 277 */
278static inline int is_cold_file(struct inode *inode) 278static inline int is_file(struct inode *inode, int type)
279{ 279{
280 return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; 280 return F2FS_I(inode)->i_advise & type;
281} 281}
282 282
283static inline void set_cold_file(struct inode *inode) 283static inline void set_file(struct inode *inode, int type)
284{ 284{
285 F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; 285 F2FS_I(inode)->i_advise |= type;
286} 286}
287 287
288static inline int is_cp_file(struct inode *inode) 288static inline void clear_file(struct inode *inode, int type)
289{ 289{
290 return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; 290 F2FS_I(inode)->i_advise &= ~type;
291} 291}
292 292
293static inline void set_cp_file(struct inode *inode) 293#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
294{ 294#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
295 F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; 295#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
296} 296#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
297#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
298#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
297 299
298static inline int is_cold_data(struct page *page) 300static inline int is_cold_data(struct page *page)
299{ 301{
@@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
310 ClearPageChecked(page); 312 ClearPageChecked(page);
311} 313}
312 314
313static inline int is_cold_node(struct page *page) 315static inline int is_node(struct page *page, int type)
314{ 316{
315 void *kaddr = page_address(page); 317 void *kaddr = page_address(page);
316 struct f2fs_node *rn = (struct f2fs_node *)kaddr; 318 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
317 unsigned int flag = le32_to_cpu(rn->footer.flag); 319 return le32_to_cpu(rn->footer.flag) & (1 << type);
318 return flag & (0x1 << COLD_BIT_SHIFT);
319} 320}
320 321
321static inline unsigned char is_fsync_dnode(struct page *page) 322#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
322{ 323#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
323 void *kaddr = page_address(page); 324#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
324 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
325 unsigned int flag = le32_to_cpu(rn->footer.flag);
326 return flag & (0x1 << FSYNC_BIT_SHIFT);
327}
328
329static inline unsigned char is_dent_dnode(struct page *page)
330{
331 void *kaddr = page_address(page);
332 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
333 unsigned int flag = le32_to_cpu(rn->footer.flag);
334 return flag & (0x1 << DENT_BIT_SHIFT);
335}
336 325
337static inline void set_cold_node(struct inode *inode, struct page *page) 326static inline void set_cold_node(struct inode *inode, struct page *page)
338{ 327{
@@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
346 rn->footer.flag = cpu_to_le32(flag); 335 rn->footer.flag = cpu_to_le32(flag);
347} 336}
348 337
349static inline void set_fsync_mark(struct page *page, int mark) 338static inline void set_mark(struct page *page, int mark, int type)
350{ 339{
351 void *kaddr = page_address(page); 340 struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
352 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
353 unsigned int flag = le32_to_cpu(rn->footer.flag);
354 if (mark)
355 flag |= (0x1 << FSYNC_BIT_SHIFT);
356 else
357 flag &= ~(0x1 << FSYNC_BIT_SHIFT);
358 rn->footer.flag = cpu_to_le32(flag);
359}
360
361static inline void set_dentry_mark(struct page *page, int mark)
362{
363 void *kaddr = page_address(page);
364 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
365 unsigned int flag = le32_to_cpu(rn->footer.flag); 341 unsigned int flag = le32_to_cpu(rn->footer.flag);
366 if (mark) 342 if (mark)
367 flag |= (0x1 << DENT_BIT_SHIFT); 343 flag |= (0x1 << type);
368 else 344 else
369 flag &= ~(0x1 << DENT_BIT_SHIFT); 345 flag &= ~(0x1 << type);
370 rn->footer.flag = cpu_to_le32(flag); 346 rn->footer.flag = cpu_to_le32(flag);
371} 347}
348#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
349#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
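
The node.h refactor folds three nearly identical footer-flag accessors into one parameterized helper plus #define aliases, so old call sites compile unchanged. The same technique in miniature, on a plain flag word (all names here are illustrative):

#include <linux/types.h>

static inline int flag_test(u32 flags, int shift)
{
        return flags & (1 << shift);
}

static inline u32 flag_mark(u32 flags, int mark, int shift)
{
        return mark ? (flags | (1 << shift)) : (flags & ~(1 << shift));
}

#define flag_is_cold(f)         flag_test(f, 0)         /* like COLD_BIT_SHIFT */
#define flag_set_cold(f)        flag_mark(f, 1, 0)
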
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); 43 void *kaddr = page_address(ipage);
44 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
44 struct f2fs_inode *raw_inode = &(raw_node->i); 45 struct f2fs_inode *raw_inode = &(raw_node->i);
45 struct qstr name; 46 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 47 struct f2fs_dir_entry *de;
48 struct qstr name;
47 struct page *page; 49 struct page *page;
48 struct inode *dir; 50 struct inode *dir, *einode;
49 int err = 0; 51 int err = 0;
50 52
51 if (!is_dent_dnode(ipage)) 53 dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
52 goto out; 54 if (!dir) {
53 55 dir = f2fs_iget(inode->i_sb, pino);
54 dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); 56 if (IS_ERR(dir)) {
55 if (IS_ERR(dir)) { 57 err = PTR_ERR(dir);
56 err = PTR_ERR(dir); 58 goto out;
57 goto out; 59 }
60 set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
61 add_dirty_dir_inode(dir);
58 } 62 }
59 63
60 name.len = le32_to_cpu(raw_inode->i_namelen); 64 name.len = le32_to_cpu(raw_inode->i_namelen);
61 name.name = raw_inode->i_name; 65 name.name = raw_inode->i_name;
62 66retry:
63 de = f2fs_find_entry(dir, &name, &page); 67 de = f2fs_find_entry(dir, &name, &page);
64 if (de) { 68 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
65 kunmap(page); 69 kunmap(page);
66 f2fs_put_page(page, 0); 70 f2fs_put_page(page, 0);
67 } else { 71 goto out;
68 err = __f2fs_add_link(dir, &name, inode); 72 }
73 if (de) {
74 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
75 if (IS_ERR(einode)) {
76 WARN_ON(1);
77 if (PTR_ERR(einode) == -ENOENT)
78 err = -EEXIST;
79 goto out;
80 }
81 f2fs_delete_entry(de, page, einode);
82 iput(einode);
83 goto retry;
69 } 84 }
70 iput(dir); 85 err = __f2fs_add_link(dir, &name, inode);
71out: 86out:
72 kunmap(ipage); 87 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
88 "ino = %x, name = %s, dir = %lx, err = %d",
89 ino_of_node(ipage), raw_inode->i_name,
90 IS_ERR(dir) ? 0 : dir->i_ino, err);
73 return err; 91 return err;
74} 92}
75 93
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
79 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; 97 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
80 struct f2fs_inode *raw_inode = &(raw_node->i); 98 struct f2fs_inode *raw_inode = &(raw_node->i);
81 99
100 if (!IS_INODE(node_page))
101 return 0;
102
82 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 103 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
83 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 104 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
84 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 105 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
88 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 109 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
89 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 110 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
90 111
91 return recover_dentry(node_page, inode); 112 if (is_dent_dnode(node_page))
113 return recover_dentry(node_page, inode);
114
115 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
116 ino_of_node(node_page), raw_inode->i_name);
117 return 0;
92} 118}
93 119
94static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 120static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
119 lock_page(page); 145 lock_page(page);
120 146
121 if (cp_ver != cpver_of_node(page)) 147 if (cp_ver != cpver_of_node(page))
122 goto unlock_out; 148 break;
123 149
124 if (!is_fsync_dnode(page)) 150 if (!is_fsync_dnode(page))
125 goto next; 151 goto next;
126 152
127 entry = get_fsync_inode(head, ino_of_node(page)); 153 entry = get_fsync_inode(head, ino_of_node(page));
128 if (entry) { 154 if (entry) {
129 entry->blkaddr = blkaddr;
130 if (IS_INODE(page) && is_dent_dnode(page)) 155 if (IS_INODE(page) && is_dent_dnode(page))
131 set_inode_flag(F2FS_I(entry->inode), 156 set_inode_flag(F2FS_I(entry->inode),
132 FI_INC_LINK); 157 FI_INC_LINK);
@@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
134 if (IS_INODE(page) && is_dent_dnode(page)) { 159 if (IS_INODE(page) && is_dent_dnode(page)) {
135 err = recover_inode_page(sbi, page); 160 err = recover_inode_page(sbi, page);
136 if (err) 161 if (err)
137 goto unlock_out; 162 break;
138 } 163 }
139 164
140 /* add this fsync inode to the list */ 165 /* add this fsync inode to the list */
141 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 166 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
142 if (!entry) { 167 if (!entry) {
143 err = -ENOMEM; 168 err = -ENOMEM;
144 goto unlock_out; 169 break;
145 } 170 }
146 171
147 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 172 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
148 if (IS_ERR(entry->inode)) { 173 if (IS_ERR(entry->inode)) {
149 err = PTR_ERR(entry->inode); 174 err = PTR_ERR(entry->inode);
150 kmem_cache_free(fsync_entry_slab, entry); 175 kmem_cache_free(fsync_entry_slab, entry);
151 goto unlock_out; 176 break;
152 } 177 }
153
154 list_add_tail(&entry->list, head); 178 list_add_tail(&entry->list, head);
155 entry->blkaddr = blkaddr;
156 }
157 if (IS_INODE(page)) {
158 err = recover_inode(entry->inode, page);
159 if (err == -ENOENT) {
160 goto next;
161 } else if (err) {
162 err = -EINVAL;
163 goto unlock_out;
164 }
165 } 179 }
180 entry->blkaddr = blkaddr;
181
182 err = recover_inode(entry->inode, page);
183 if (err && err != -ENOENT)
184 break;
166next: 185next:
167 /* check next segment */ 186 /* check next segment */
168 blkaddr = next_blkaddr_of_node(page); 187 blkaddr = next_blkaddr_of_node(page);
169 } 188 }
170unlock_out:
171 unlock_page(page); 189 unlock_page(page);
172out: 190out:
173 __free_pages(page, 0); 191 __free_pages(page, 0);
174 return err; 192 return err;
175} 193}
176 194
177static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, 195static void destroy_fsync_dnodes(struct list_head *head)
178 struct list_head *head)
179{ 196{
180 struct fsync_inode_entry *entry, *tmp; 197 struct fsync_inode_entry *entry, *tmp;
181 198
@@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
186 } 203 }
187} 204}
188 205
189static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, 206static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
190 block_t blkaddr) 207 block_t blkaddr, struct dnode_of_data *dn)
191{ 208{
192 struct seg_entry *sentry; 209 struct seg_entry *sentry;
193 unsigned int segno = GET_SEGNO(sbi, blkaddr); 210 unsigned int segno = GET_SEGNO(sbi, blkaddr);
194 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 211 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
195 (sbi->blocks_per_seg - 1); 212 (sbi->blocks_per_seg - 1);
196 struct f2fs_summary sum; 213 struct f2fs_summary sum;
197 nid_t ino; 214 nid_t ino, nid;
198 void *kaddr; 215 void *kaddr;
199 struct inode *inode; 216 struct inode *inode;
200 struct page *node_page; 217 struct page *node_page;
@@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
203 220
204 sentry = get_seg_entry(sbi, segno); 221 sentry = get_seg_entry(sbi, segno);
205 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) 222 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
206 return; 223 return 0;
207 224
208 /* Get the previous summary */ 225 /* Get the previous summary */
209 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { 226 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
222 f2fs_put_page(sum_page, 1); 239 f2fs_put_page(sum_page, 1);
223 } 240 }
224 241
242 /* Use the locked dnode page and inode */
243 nid = le32_to_cpu(sum.nid);
244 if (dn->inode->i_ino == nid) {
245 struct dnode_of_data tdn = *dn;
246 tdn.nid = nid;
247 tdn.node_page = dn->inode_page;
248 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
249 truncate_data_blocks_range(&tdn, 1);
250 return 0;
251 } else if (dn->nid == nid) {
252 struct dnode_of_data tdn = *dn;
253 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
254 truncate_data_blocks_range(&tdn, 1);
255 return 0;
256 }
257
225 /* Get the node page */ 258 /* Get the node page */
226 node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); 259 node_page = get_node_page(sbi, nid);
260 if (IS_ERR(node_page))
261 return PTR_ERR(node_page);
227 bidx = start_bidx_of_node(ofs_of_node(node_page)) + 262 bidx = start_bidx_of_node(ofs_of_node(node_page)) +
228 le16_to_cpu(sum.ofs_in_node); 263 le16_to_cpu(sum.ofs_in_node);
229 ino = ino_of_node(node_page); 264 ino = ino_of_node(node_page);
230 f2fs_put_page(node_page, 1); 265 f2fs_put_page(node_page, 1);
231 266
232 /* Deallocate previous index in the node page */ 267 /* Deallocate previous index in the node page */
233 inode = f2fs_iget(sbi->sb, ino); 268 inode = f2fs_iget(sbi->sb, ino);
234 if (IS_ERR(inode)) 269 if (IS_ERR(inode))
235 return; 270 return PTR_ERR(inode);
236 271
237 truncate_hole(inode, bidx, bidx + 1); 272 truncate_hole(inode, bidx, bidx + 1);
238 iput(inode); 273 iput(inode);
274 return 0;
239} 275}
240 276
241static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 277static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
245 struct dnode_of_data dn; 281 struct dnode_of_data dn;
246 struct f2fs_summary sum; 282 struct f2fs_summary sum;
247 struct node_info ni; 283 struct node_info ni;
248 int err = 0; 284 int err = 0, recovered = 0;
249 int ilock; 285 int ilock;
250 286
251 start = start_bidx_of_node(ofs_of_node(page)); 287 start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
283 } 319 }
284 320
285 /* Check the previous node page having this index */ 321 /* Check the previous node page having this index */
286 check_index_in_prev_nodes(sbi, dest); 322 err = check_index_in_prev_nodes(sbi, dest, &dn);
323 if (err)
324 goto err;
287 325
288 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); 326 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
289 327
290 /* write dummy data page */ 328 /* write dummy data page */
291 recover_data_page(sbi, NULL, &sum, src, dest); 329 recover_data_page(sbi, NULL, &sum, src, dest);
292 update_extent_cache(dest, &dn); 330 update_extent_cache(dest, &dn);
331 recovered++;
293 } 332 }
294 dn.ofs_in_node++; 333 dn.ofs_in_node++;
295 } 334 }
@@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
305 set_page_dirty(dn.node_page); 344 set_page_dirty(dn.node_page);
306 345
307 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); 346 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
347err:
308 f2fs_put_dnode(&dn); 348 f2fs_put_dnode(&dn);
309 mutex_unlock_op(sbi, ilock); 349 mutex_unlock_op(sbi, ilock);
310 return 0; 350
351 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
352 "recovered_data = %d blocks, err = %d",
353 inode->i_ino, recovered, err);
354 return err;
311} 355}
312 356
313static int recover_data(struct f2fs_sb_info *sbi, 357static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
340 lock_page(page); 384 lock_page(page);
341 385
342 if (cp_ver != cpver_of_node(page)) 386 if (cp_ver != cpver_of_node(page))
343 goto unlock_out; 387 break;
344 388
345 entry = get_fsync_inode(head, ino_of_node(page)); 389 entry = get_fsync_inode(head, ino_of_node(page));
346 if (!entry) 390 if (!entry)
@@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
348 392
349 err = do_recover_data(sbi, entry->inode, page, blkaddr); 393 err = do_recover_data(sbi, entry->inode, page, blkaddr);
350 if (err) 394 if (err)
351 goto out; 395 break;
352 396
353 if (entry->blkaddr == blkaddr) { 397 if (entry->blkaddr == blkaddr) {
354 iput(entry->inode); 398 iput(entry->inode);
@@ -359,7 +403,6 @@ next:
359 /* check next segment */ 403 /* check next segment */
360 blkaddr = next_blkaddr_of_node(page); 404 blkaddr = next_blkaddr_of_node(page);
361 } 405 }
362unlock_out:
363 unlock_page(page); 406 unlock_page(page);
364out: 407out:
365 __free_pages(page, 0); 408 __free_pages(page, 0);
@@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
382 INIT_LIST_HEAD(&inode_list); 425 INIT_LIST_HEAD(&inode_list);
383 426
384 /* step #1: find fsynced inode numbers */ 427 /* step #1: find fsynced inode numbers */
428 sbi->por_doing = 1;
385 err = find_fsync_dnodes(sbi, &inode_list); 429 err = find_fsync_dnodes(sbi, &inode_list);
386 if (err) 430 if (err)
387 goto out; 431 goto out;
@@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
390 goto out; 434 goto out;
391 435
392 /* step #2: recover data */ 436 /* step #2: recover data */
393 sbi->por_doing = 1;
394 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 437 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
395 sbi->por_doing = 0;
396 BUG_ON(!list_empty(&inode_list)); 438 BUG_ON(!list_empty(&inode_list));
397out: 439out:
398 destroy_fsync_dnodes(sbi, &inode_list); 440 destroy_fsync_dnodes(&inode_list);
399 kmem_cache_destroy(fsync_entry_slab); 441 kmem_cache_destroy(fsync_entry_slab);
400 write_checkpoint(sbi, false); 442 sbi->por_doing = 0;
443 if (!err)
444 write_checkpoint(sbi, false);
401 return err; 445 return err;
402} 446}
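
recover_dentry() above gains a retry loop: if the recovered name already points at a different inode, the stale entry is deleted and the lookup repeated before the new link is added. Distilled control flow (lookup_entry(), remove_entry() and add_link() stand in for f2fs_find_entry(), f2fs_delete_entry() and __f2fs_add_link(); the real code also resolves and iputs the conflicting inode and releases the found page):

static int relink_recovered_name(struct inode *dir, struct qstr *name,
                                 struct inode *inode)
{
        struct f2fs_dir_entry *de;
        struct page *page;

        for (;;) {
                de = lookup_entry(dir, name, &page);
                if (!de)
                        break;                  /* name is free now */
                if (le32_to_cpu(de->ino) == inode->i_ino)
                        return 0;               /* already linked */
                remove_entry(de, page);         /* evict the stale entry */
        }
        return add_link(dir, name, inode);
}
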
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
94 * Adding dirty entry into seglist is not critical operation. 94 * Adding dirty entry into seglist is not critical operation.
95 * If a given segment is one of current working segments, it won't be added. 95 * If a given segment is one of current working segments, it won't be added.
96 */ 96 */
97void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) 97static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
98{ 98{
99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
100 unsigned short valid_blocks; 100 unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) 126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
127{ 127{
128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
129 unsigned int segno, offset = 0; 129 unsigned int segno = -1;
130 unsigned int total_segs = TOTAL_SEGS(sbi); 130 unsigned int total_segs = TOTAL_SEGS(sbi);
131 131
132 mutex_lock(&dirty_i->seglist_lock); 132 mutex_lock(&dirty_i->seglist_lock);
133 while (1) { 133 while (1) {
134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
135 offset); 135 segno + 1);
136 if (segno >= total_segs) 136 if (segno >= total_segs)
137 break; 137 break;
138 __set_test_and_free(sbi, segno); 138 __set_test_and_free(sbi, segno);
139 offset = segno + 1;
140 } 139 }
141 mutex_unlock(&dirty_i->seglist_lock); 140 mutex_unlock(&dirty_i->seglist_lock);
142} 141}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
144void clear_prefree_segments(struct f2fs_sb_info *sbi) 143void clear_prefree_segments(struct f2fs_sb_info *sbi)
145{ 144{
146 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 145 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
147 unsigned int segno, offset = 0; 146 unsigned int segno = -1;
148 unsigned int total_segs = TOTAL_SEGS(sbi); 147 unsigned int total_segs = TOTAL_SEGS(sbi);
149 148
150 mutex_lock(&dirty_i->seglist_lock); 149 mutex_lock(&dirty_i->seglist_lock);
151 while (1) { 150 while (1) {
152 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 151 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
153 offset); 152 segno + 1);
154 if (segno >= total_segs) 153 if (segno >= total_segs)
155 break; 154 break;
156 155
157 offset = segno + 1;
158 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) 156 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
159 dirty_i->nr_dirty[PRE]--; 157 dirty_i->nr_dirty[PRE]--;
160 158
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
257 * This function should be resided under the curseg_mutex lock 255 * This function should be resided under the curseg_mutex lock
258 */ 256 */
259static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, 257static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
260 struct f2fs_summary *sum, unsigned short offset) 258 struct f2fs_summary *sum)
261{ 259{
262 struct curseg_info *curseg = CURSEG_I(sbi, type); 260 struct curseg_info *curseg = CURSEG_I(sbi, type);
263 void *addr = curseg->sum_blk; 261 void *addr = curseg->sum_blk;
264 addr += offset * sizeof(struct f2fs_summary); 262 addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
265 memcpy(addr, sum, sizeof(struct f2fs_summary)); 263 memcpy(addr, sum, sizeof(struct f2fs_summary));
266 return; 264 return;
267} 265}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
311 f2fs_put_page(page, 1); 309 f2fs_put_page(page, 1);
312} 310}
313 311
314static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
315{
316 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
317 unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
318 unsigned int segno;
319 unsigned int ofs = 0;
320
321 /*
322 * If there is not enough reserved sections,
323 * we should not reuse prefree segments.
324 */
325 if (has_not_enough_free_secs(sbi, 0))
326 return NULL_SEGNO;
327
328 /*
329 * NODE page should not reuse prefree segment,
330 * since those information is used for SPOR.
331 */
332 if (IS_NODESEG(type))
333 return NULL_SEGNO;
334next:
335 segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
336 ofs += sbi->segs_per_sec;
337
338 if (segno < TOTAL_SEGS(sbi)) {
339 int i;
340
341 /* skip intermediate segments in a section */
342 if (segno % sbi->segs_per_sec)
343 goto next;
344
345 /* skip if the section is currently used */
346 if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
347 goto next;
348
349 /* skip if whole section is not prefree */
350 for (i = 1; i < sbi->segs_per_sec; i++)
351 if (!test_bit(segno + i, prefree_segmap))
352 goto next;
353
354 /* skip if whole section was not free at the last checkpoint */
355 for (i = 0; i < sbi->segs_per_sec; i++)
356 if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
357 goto next;
358
359 return segno;
360 }
361 return NULL_SEGNO;
362}
363
364static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) 312static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
365{ 313{
366 struct curseg_info *curseg = CURSEG_I(sbi, type); 314 struct curseg_info *curseg = CURSEG_I(sbi, type);
367 unsigned int segno = curseg->segno; 315 unsigned int segno = curseg->segno + 1;
368 struct free_segmap_info *free_i = FREE_I(sbi); 316 struct free_segmap_info *free_i = FREE_I(sbi);
369 317
370 if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) 318 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
371 return !test_bit(segno + 1, free_i->free_segmap); 319 return !test_bit(segno, free_i->free_segmap);
372 return 0; 320 return 0;
373} 321}
374 322
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
495 int dir = ALLOC_LEFT; 443 int dir = ALLOC_LEFT;
496 444
497 write_sum_page(sbi, curseg->sum_blk, 445 write_sum_page(sbi, curseg->sum_blk,
498 GET_SUM_BLOCK(sbi, curseg->segno)); 446 GET_SUM_BLOCK(sbi, segno));
499 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) 447 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
500 dir = ALLOC_RIGHT; 448 dir = ALLOC_RIGHT;
501 449
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
599 goto out; 547 goto out;
600 } 548 }
601 549
602 curseg->next_segno = check_prefree_segments(sbi, type); 550 if (type == CURSEG_WARM_NODE)
603
604 if (curseg->next_segno != NULL_SEGNO)
605 change_curseg(sbi, type, false);
606 else if (type == CURSEG_WARM_NODE)
607 new_curseg(sbi, type, false); 551 new_curseg(sbi, type, false);
608 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) 552 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
609 new_curseg(sbi, type, false); 553 new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
612 else 556 else
613 new_curseg(sbi, type, false); 557 new_curseg(sbi, type, false);
614out: 558out:
559#ifdef CONFIG_F2FS_STAT_FS
615 sbi->segment_count[curseg->alloc_type]++; 560 sbi->segment_count[curseg->alloc_type]++;
561#endif
562 return;
616} 563}
617 564
618void allocate_new_segments(struct f2fs_sb_info *sbi) 565void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
795 742
796 if (S_ISDIR(inode->i_mode)) 743 if (S_ISDIR(inode->i_mode))
797 return CURSEG_HOT_DATA; 744 return CURSEG_HOT_DATA;
798 else if (is_cold_data(page) || is_cold_file(inode)) 745 else if (is_cold_data(page) || file_is_cold(inode))
799 return CURSEG_COLD_DATA; 746 return CURSEG_COLD_DATA;
800 else 747 else
801 return CURSEG_WARM_DATA; 748 return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
844 * because, this function updates a summary entry in the 791 * because, this function updates a summary entry in the
845 * current summary block. 792 * current summary block.
846 */ 793 */
847 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 794 __add_sum_entry(sbi, type, sum);
848 795
849 mutex_lock(&sit_i->sentry_lock); 796 mutex_lock(&sit_i->sentry_lock);
850 __refresh_next_blkoff(sbi, curseg); 797 __refresh_next_blkoff(sbi, curseg);
798#ifdef CONFIG_F2FS_STAT_FS
851 sbi->block_count[curseg->alloc_type]++; 799 sbi->block_count[curseg->alloc_type]++;
800#endif
852 801
853 /* 802 /*
854 * SIT information should be updated before segment allocation, 803 * SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
943 892
944 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 893 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
945 (sbi->blocks_per_seg - 1); 894 (sbi->blocks_per_seg - 1);
946 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 895 __add_sum_entry(sbi, type, sum);
947 896
948 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 897 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
949 898
@@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
980 } 929 }
981 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 930 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
982 (sbi->blocks_per_seg - 1); 931 (sbi->blocks_per_seg - 1);
983 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 932 __add_sum_entry(sbi, type, sum);
984 933
985 /* change the current log to the next block addr in advance */ 934 /* change the current log to the next block addr in advance */
986 if (next_segno != segno) { 935 if (next_segno != segno) {
@@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1579{ 1528{
1580 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1529 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1581 struct free_segmap_info *free_i = FREE_I(sbi); 1530 struct free_segmap_info *free_i = FREE_I(sbi);
1582 unsigned int segno = 0, offset = 0; 1531 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
1583 unsigned short valid_blocks; 1532 unsigned short valid_blocks;
1584 1533
1585 while (segno < TOTAL_SEGS(sbi)) { 1534 while (1) {
1586 /* find dirty segment based on free segmap */ 1535 /* find dirty segment based on free segmap */
1587 segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); 1536 segno = find_next_inuse(free_i, total_segs, offset);
1588 if (segno >= TOTAL_SEGS(sbi)) 1537 if (segno >= total_segs)
1589 break; 1538 break;
1590 offset = segno + 1; 1539 offset = segno + 1;
1591 valid_blocks = get_valid_blocks(sbi, segno, 0); 1540 valid_blocks = get_valid_blocks(sbi, segno, 0);
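
Both prefree-segment walks in segment.c drop their separate offset variable: segno starts at (unsigned int)-1, so segno + 1 wraps to 0 on the first find_next_bit() call and thereafter resumes just past the previous hit. The idiom in isolation:

static void walk_set_bits(unsigned long *bitmap, unsigned int total_segs)
{
        unsigned int segno = -1;        /* (unsigned)-1 + 1 wraps to 0 */

        while (1) {
                segno = find_next_bit(bitmap, total_segs, segno + 1);
                if (segno >= total_segs)
                        break;
                /* ... act on 'segno' ... */
        }
}
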
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -34,7 +34,7 @@
34static struct kmem_cache *f2fs_inode_cachep; 34static struct kmem_cache *f2fs_inode_cachep;
35 35
36enum { 36enum {
37 Opt_gc_background_off, 37 Opt_gc_background,
38 Opt_disable_roll_forward, 38 Opt_disable_roll_forward,
39 Opt_discard, 39 Opt_discard,
40 Opt_noheap, 40 Opt_noheap,
@@ -46,7 +46,7 @@ enum {
46}; 46};
47 47
48static match_table_t f2fs_tokens = { 48static match_table_t f2fs_tokens = {
49 {Opt_gc_background_off, "background_gc_off"}, 49 {Opt_gc_background, "background_gc=%s"},
50 {Opt_disable_roll_forward, "disable_roll_forward"}, 50 {Opt_disable_roll_forward, "disable_roll_forward"},
51 {Opt_discard, "discard"}, 51 {Opt_discard, "discard"},
52 {Opt_noheap, "no_heap"}, 52 {Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@ static void init_once(void *foo)
76 inode_init_once(&fi->vfs_inode); 76 inode_init_once(&fi->vfs_inode);
77} 77}
78 78
79static int parse_options(struct super_block *sb, char *options)
80{
81 struct f2fs_sb_info *sbi = F2FS_SB(sb);
82 substring_t args[MAX_OPT_ARGS];
83 char *p, *name;
84 int arg = 0;
85
86 if (!options)
87 return 0;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 if (!*p)
92 continue;
93 /*
94 * Initialize args struct so we know whether arg was
95 * found; some options take optional arguments.
96 */
97 args[0].to = args[0].from = NULL;
98 token = match_token(p, f2fs_tokens, args);
99
100 switch (token) {
101 case Opt_gc_background:
102 name = match_strdup(&args[0]);
103
104 if (!name)
105 return -ENOMEM;
106 if (!strncmp(name, "on", 2))
107 set_opt(sbi, BG_GC);
108 else if (!strncmp(name, "off", 3))
109 clear_opt(sbi, BG_GC);
110 else {
111 kfree(name);
112 return -EINVAL;
113 }
114 kfree(name);
115 break;
116 case Opt_disable_roll_forward:
117 set_opt(sbi, DISABLE_ROLL_FORWARD);
118 break;
119 case Opt_discard:
120 set_opt(sbi, DISCARD);
121 break;
122 case Opt_noheap:
123 set_opt(sbi, NOHEAP);
124 break;
125#ifdef CONFIG_F2FS_FS_XATTR
126 case Opt_nouser_xattr:
127 clear_opt(sbi, XATTR_USER);
128 break;
129#else
130 case Opt_nouser_xattr:
131 f2fs_msg(sb, KERN_INFO,
132 "nouser_xattr options not supported");
133 break;
134#endif
135#ifdef CONFIG_F2FS_FS_POSIX_ACL
136 case Opt_noacl:
137 clear_opt(sbi, POSIX_ACL);
138 break;
139#else
140 case Opt_noacl:
141 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
142 break;
143#endif
144 case Opt_active_logs:
145 if (args->from && match_int(args, &arg))
146 return -EINVAL;
147 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
148 return -EINVAL;
149 sbi->active_logs = arg;
150 break;
151 case Opt_disable_ext_identify:
152 set_opt(sbi, DISABLE_EXT_IDENTIFY);
153 break;
154 default:
155 f2fs_msg(sb, KERN_ERR,
156 "Unrecognized mount option \"%s\" or missing value",
157 p);
158 return -EINVAL;
159 }
160 }
161 return 0;
162}
163
79static struct inode *f2fs_alloc_inode(struct super_block *sb) 164static struct inode *f2fs_alloc_inode(struct super_block *sb)
80{ 165{
81 struct f2fs_inode_info *fi; 166 struct f2fs_inode_info *fi;
@@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode)
112 return generic_drop_inode(inode); 197 return generic_drop_inode(inode);
113} 198}
114 199
200/*
201 * f2fs_dirty_inode() is called from __mark_inode_dirty()
202 *
203 * We should set the FI_DIRTY_INODE flag so the inode gets written back through write_inode.
204 */
205static void f2fs_dirty_inode(struct inode *inode, int flags)
206{
207 set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
208 return;
209}
210
115static void f2fs_i_callback(struct rcu_head *head) 211static void f2fs_i_callback(struct rcu_head *head)
116{ 212{
117 struct inode *inode = container_of(head, struct inode, i_rcu); 213 struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
170{ 266{
171 int err; 267 int err;
172 268
173 if (sb->s_flags & MS_RDONLY) 269 if (f2fs_readonly(sb))
174 return 0; 270 return 0;
175 271
176 err = f2fs_sync_fs(sb, 1); 272 err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
214{ 310{
215 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); 311 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
216 312
217 if (test_opt(sbi, BG_GC)) 313 if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
218 seq_puts(seq, ",background_gc_on"); 314 seq_printf(seq, ",background_gc=%s", "on");
219 else 315 else
220 seq_puts(seq, ",background_gc_off"); 316 seq_printf(seq, ",background_gc=%s", "off");
221 if (test_opt(sbi, DISABLE_ROLL_FORWARD)) 317 if (test_opt(sbi, DISABLE_ROLL_FORWARD))
222 seq_puts(seq, ",disable_roll_forward"); 318 seq_puts(seq, ",disable_roll_forward");
223 if (test_opt(sbi, DISCARD)) 319 if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
244 return 0; 340 return 0;
245} 341}
246 342
343static int f2fs_remount(struct super_block *sb, int *flags, char *data)
344{
345 struct f2fs_sb_info *sbi = F2FS_SB(sb);
346 struct f2fs_mount_info org_mount_opt;
347 int err, active_logs;
348
349 /*
350 * Save the old mount options in case we
351 * need to restore them.
352 */
353 org_mount_opt = sbi->mount_opt;
354 active_logs = sbi->active_logs;
355
356 /* parse mount options */
357 err = parse_options(sb, data);
358 if (err)
359 goto restore_opts;
360
361 /*
362 * Previous and new state of filesystem is RO,
363 * so no point in checking GC conditions.
364 */
365 if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
366 goto skip;
367
368 /*
369 * We stop the GC thread if FS is mounted as RO
370 * or if background_gc = off is passed in mount
371 * option. Also sync the filesystem.
372 */
373 if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
374 if (sbi->gc_thread) {
375 stop_gc_thread(sbi);
376 f2fs_sync_fs(sb, 1);
377 }
378 } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
379 err = start_gc_thread(sbi);
380 if (err)
381 goto restore_opts;
382 }
383skip:
384 /* Update the POSIXACL Flag */
385 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
386 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
387 return 0;
388
389restore_opts:
390 sbi->mount_opt = org_mount_opt;
391 sbi->active_logs = active_logs;
392 return err;
393}
394
247static struct super_operations f2fs_sops = { 395static struct super_operations f2fs_sops = {
248 .alloc_inode = f2fs_alloc_inode, 396 .alloc_inode = f2fs_alloc_inode,
249 .drop_inode = f2fs_drop_inode, 397 .drop_inode = f2fs_drop_inode,
250 .destroy_inode = f2fs_destroy_inode, 398 .destroy_inode = f2fs_destroy_inode,
251 .write_inode = f2fs_write_inode, 399 .write_inode = f2fs_write_inode,
400 .dirty_inode = f2fs_dirty_inode,
252 .show_options = f2fs_show_options, 401 .show_options = f2fs_show_options,
253 .evict_inode = f2fs_evict_inode, 402 .evict_inode = f2fs_evict_inode,
254 .put_super = f2fs_put_super, 403 .put_super = f2fs_put_super,
@@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = {
256 .freeze_fs = f2fs_freeze, 405 .freeze_fs = f2fs_freeze,
257 .unfreeze_fs = f2fs_unfreeze, 406 .unfreeze_fs = f2fs_unfreeze,
258 .statfs = f2fs_statfs, 407 .statfs = f2fs_statfs,
408 .remount_fs = f2fs_remount,
259}; 409};
260 410
261static struct inode *f2fs_nfs_get_inode(struct super_block *sb, 411static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
303 .get_parent = f2fs_get_parent, 453 .get_parent = f2fs_get_parent,
304}; 454};
305 455
306static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
307 char *options)
308{
309 substring_t args[MAX_OPT_ARGS];
310 char *p;
311 int arg = 0;
312
313 if (!options)
314 return 0;
315
316 while ((p = strsep(&options, ",")) != NULL) {
317 int token;
318 if (!*p)
319 continue;
320 /*
321 * Initialize args struct so we know whether arg was
322 * found; some options take optional arguments.
323 */
324 args[0].to = args[0].from = NULL;
325 token = match_token(p, f2fs_tokens, args);
326
327 switch (token) {
328 case Opt_gc_background_off:
329 clear_opt(sbi, BG_GC);
330 break;
331 case Opt_disable_roll_forward:
332 set_opt(sbi, DISABLE_ROLL_FORWARD);
333 break;
334 case Opt_discard:
335 set_opt(sbi, DISCARD);
336 break;
337 case Opt_noheap:
338 set_opt(sbi, NOHEAP);
339 break;
340#ifdef CONFIG_F2FS_FS_XATTR
341 case Opt_nouser_xattr:
342 clear_opt(sbi, XATTR_USER);
343 break;
344#else
345 case Opt_nouser_xattr:
346 f2fs_msg(sb, KERN_INFO,
347 "nouser_xattr options not supported");
348 break;
349#endif
350#ifdef CONFIG_F2FS_FS_POSIX_ACL
351 case Opt_noacl:
352 clear_opt(sbi, POSIX_ACL);
353 break;
354#else
355 case Opt_noacl:
356 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
357 break;
358#endif
359 case Opt_active_logs:
360 if (args->from && match_int(args, &arg))
361 return -EINVAL;
362 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
363 return -EINVAL;
364 sbi->active_logs = arg;
365 break;
366 case Opt_disable_ext_identify:
367 set_opt(sbi, DISABLE_EXT_IDENTIFY);
368 break;
369 default:
370 f2fs_msg(sb, KERN_ERR,
371 "Unrecognized mount option \"%s\" or missing value",
372 p);
373 return -EINVAL;
374 }
375 }
376 return 0;
377}
378
379static loff_t max_file_size(unsigned bits) 456static loff_t max_file_size(unsigned bits)
380{ 457{
381 loff_t result = ADDRS_PER_INODE; 458 loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
541 if (err) 618 if (err)
542 goto free_sb_buf; 619 goto free_sb_buf;
543 } 620 }
621 sb->s_fs_info = sbi;
544 /* init some FS parameters */ 622 /* init some FS parameters */
545 sbi->active_logs = NR_CURSEG_TYPE; 623 sbi->active_logs = NR_CURSEG_TYPE;
546 624
@@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
553 set_opt(sbi, POSIX_ACL); 631 set_opt(sbi, POSIX_ACL);
554#endif 632#endif
555 /* parse mount options */ 633 /* parse mount options */
556 err = parse_options(sb, sbi, (char *)data); 634 err = parse_options(sb, (char *)data);
557 if (err) 635 if (err)
558 goto free_sb_buf; 636 goto free_sb_buf;
559 637
@@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
565 sb->s_xattr = f2fs_xattr_handlers; 643 sb->s_xattr = f2fs_xattr_handlers;
566 sb->s_export_op = &f2fs_export_ops; 644 sb->s_export_op = &f2fs_export_ops;
567 sb->s_magic = F2FS_SUPER_MAGIC; 645 sb->s_magic = F2FS_SUPER_MAGIC;
568 sb->s_fs_info = sbi;
569 sb->s_time_gran = 1; 646 sb->s_time_gran = 1;
570 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 647 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
571 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); 648 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
674 "Cannot recover all fsync data errno=%ld", err); 751 "Cannot recover all fsync data errno=%ld", err);
675 } 752 }
676 753
677 /* After POR, we can run background GC thread */ 754 /*
678 err = start_gc_thread(sbi); 755 * Start the GC thread only if the filesystem is
679 if (err) 756 * not mounted read-only.
680 goto fail; 757 */
758 if (!(sb->s_flags & MS_RDONLY)) {
759 /* After POR, we can run the background GC thread. */
760 err = start_gc_thread(sbi);
761 if (err)
762 goto fail;
763 }
681 764
682 err = f2fs_build_stats(sbi); 765 err = f2fs_build_stats(sbi);
683 if (err) 766 if (err)
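
The remount path added above follows the usual save-parse-restore idiom: snapshot the current options, try the new ones, and roll everything back if parsing or the GC-thread start fails. A minimal sketch of that idiom for a hypothetical filesystem (the demo_* names are assumptions, not f2fs code):

static int demo_remount(struct super_block *sb, int *flags, char *data)
{
        struct demo_sb_info *sbi = sb->s_fs_info;
        struct demo_options saved = sbi->opts;  /* snapshot for rollback */
        int err;

        err = demo_parse_options(sb, data);
        if (err)
                goto restore;
        /* apply side effects (threads, feature flags) based on new opts */
        return 0;

restore:
        sbi->opts = saved;                      /* undo on any failure */
        return err;
}
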
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h> 22#include <linux/f2fs_fs.h>
23#include <linux/security.h>
23#include "f2fs.h" 24#include "f2fs.h"
24#include "xattr.h" 25#include "xattr.h"
25 26
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
43 prefix = XATTR_TRUSTED_PREFIX; 44 prefix = XATTR_TRUSTED_PREFIX;
44 prefix_len = XATTR_TRUSTED_PREFIX_LEN; 45 prefix_len = XATTR_TRUSTED_PREFIX_LEN;
45 break; 46 break;
47 case F2FS_XATTR_INDEX_SECURITY:
48 prefix = XATTR_SECURITY_PREFIX;
49 prefix_len = XATTR_SECURITY_PREFIX_LEN;
50 break;
46 default: 51 default:
47 return -EINVAL; 52 return -EINVAL;
48 } 53 }
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
50 total_len = prefix_len + name_len + 1; 55 total_len = prefix_len + name_len + 1;
51 if (list && total_len <= list_size) { 56 if (list && total_len <= list_size) {
52 memcpy(list, prefix, prefix_len); 57 memcpy(list, prefix, prefix_len);
53 memcpy(list+prefix_len, name, name_len); 58 memcpy(list + prefix_len, name, name_len);
54 list[prefix_len + name_len] = '\0'; 59 list[prefix_len + name_len] = '\0';
55 } 60 }
56 return total_len; 61 return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
70 if (!capable(CAP_SYS_ADMIN)) 75 if (!capable(CAP_SYS_ADMIN))
71 return -EPERM; 76 return -EPERM;
72 break; 77 break;
78 case F2FS_XATTR_INDEX_SECURITY:
79 break;
73 default: 80 default:
74 return -EINVAL; 81 return -EINVAL;
75 } 82 }
76 if (strcmp(name, "") == 0) 83 if (strcmp(name, "") == 0)
77 return -EINVAL; 84 return -EINVAL;
78 return f2fs_getxattr(dentry->d_inode, type, name, 85 return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
79 buffer, size);
80} 86}
81 87
82static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, 88static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
93 if (!capable(CAP_SYS_ADMIN)) 99 if (!capable(CAP_SYS_ADMIN))
94 return -EPERM; 100 return -EPERM;
95 break; 101 break;
102 case F2FS_XATTR_INDEX_SECURITY:
103 break;
96 default: 104 default:
97 return -EINVAL; 105 return -EINVAL;
98 } 106 }
99 if (strcmp(name, "") == 0) 107 if (strcmp(name, "") == 0)
100 return -EINVAL; 108 return -EINVAL;
101 109
102 return f2fs_setxattr(dentry->d_inode, type, name, value, size); 110 return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
103} 111}
104 112
105static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, 113static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
145 return 0; 153 return 0;
146} 154}
147 155
156#ifdef CONFIG_F2FS_FS_SECURITY
157static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
158 void *page)
159{
160 const struct xattr *xattr;
161 int err = 0;
162
163 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
164 err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
165 xattr->name, xattr->value,
166 xattr->value_len, (struct page *)page);
167 if (err < 0)
168 break;
169 }
170 return err;
171}
172
173int f2fs_init_security(struct inode *inode, struct inode *dir,
174 const struct qstr *qstr, struct page *ipage)
175{
176 return security_inode_init_security(inode, dir, qstr,
177 &f2fs_initxattrs, ipage);
178}
179#endif
180
148const struct xattr_handler f2fs_xattr_user_handler = { 181const struct xattr_handler f2fs_xattr_user_handler = {
149 .prefix = XATTR_USER_PREFIX, 182 .prefix = XATTR_USER_PREFIX,
150 .flags = F2FS_XATTR_INDEX_USER, 183 .flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
169 .set = f2fs_xattr_advise_set, 202 .set = f2fs_xattr_advise_set,
170}; 203};
171 204
205const struct xattr_handler f2fs_xattr_security_handler = {
206 .prefix = XATTR_SECURITY_PREFIX,
207 .flags = F2FS_XATTR_INDEX_SECURITY,
208 .list = f2fs_xattr_generic_list,
209 .get = f2fs_xattr_generic_get,
210 .set = f2fs_xattr_generic_set,
211};
212
172static const struct xattr_handler *f2fs_xattr_handler_map[] = { 213static const struct xattr_handler *f2fs_xattr_handler_map[] = {
173 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, 214 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
174#ifdef CONFIG_F2FS_FS_POSIX_ACL 215#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
176 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, 217 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
177#endif 218#endif
178 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, 219 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
220#ifdef CONFIG_F2FS_FS_SECURITY
221 [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
222#endif
179 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, 223 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
180}; 224};
181 225
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
186 &f2fs_xattr_acl_default_handler, 230 &f2fs_xattr_acl_default_handler,
187#endif 231#endif
188 &f2fs_xattr_trusted_handler, 232 &f2fs_xattr_trusted_handler,
233#ifdef CONFIG_F2FS_FS_SECURITY
234 &f2fs_xattr_security_handler,
235#endif
189 &f2fs_xattr_advise_handler, 236 &f2fs_xattr_advise_handler,
190 NULL, 237 NULL,
191}; 238};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
218 return -ENODATA; 265 return -ENODATA;
219 266
220 page = get_node_page(sbi, fi->i_xattr_nid); 267 page = get_node_page(sbi, fi->i_xattr_nid);
268 if (IS_ERR(page))
269 return PTR_ERR(page);
221 base_addr = page_address(page); 270 base_addr = page_address(page);
222 271
223 list_for_each_xattr(entry, base_addr) { 272 list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
268 return 0; 317 return 0;
269 318
270 page = get_node_page(sbi, fi->i_xattr_nid); 319 page = get_node_page(sbi, fi->i_xattr_nid);
320 if (IS_ERR(page))
321 return PTR_ERR(page);
271 base_addr = page_address(page); 322 base_addr = page_address(page);
272 323
273 list_for_each_xattr(entry, base_addr) { 324 list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
296} 347}
297 348
298int f2fs_setxattr(struct inode *inode, int name_index, const char *name, 349int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
299 const void *value, size_t value_len) 350 const void *value, size_t value_len, struct page *ipage)
300{ 351{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct f2fs_inode_info *fi = F2FS_I(inode); 353 struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
335 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); 386 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
336 mark_inode_dirty(inode); 387 mark_inode_dirty(inode);
337 388
338 page = new_node_page(&dn, XATTR_NODE_OFFSET); 389 page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
339 if (IS_ERR(page)) { 390 if (IS_ERR(page)) {
340 alloc_nid_failed(sbi, fi->i_xattr_nid); 391 alloc_nid_failed(sbi, fi->i_xattr_nid);
341 fi->i_xattr_nid = 0; 392 fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
435 inode->i_ctime = CURRENT_TIME; 486 inode->i_ctime = CURRENT_TIME;
436 clear_inode_flag(fi, FI_ACL_MODE); 487 clear_inode_flag(fi, FI_ACL_MODE);
437 } 488 }
438 update_inode_page(inode); 489 if (ipage)
490 update_inode(inode, ipage);
491 else
492 update_inode_page(inode);
439 mutex_unlock_op(sbi, ilock); 493 mutex_unlock_op(sbi, ilock);
440 494
441 return 0; 495 return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
112extern const struct xattr_handler f2fs_xattr_acl_access_handler; 112extern const struct xattr_handler f2fs_xattr_acl_access_handler;
113extern const struct xattr_handler f2fs_xattr_acl_default_handler; 113extern const struct xattr_handler f2fs_xattr_acl_default_handler;
114extern const struct xattr_handler f2fs_xattr_advise_handler; 114extern const struct xattr_handler f2fs_xattr_advise_handler;
115extern const struct xattr_handler f2fs_xattr_security_handler;
115 116
116extern const struct xattr_handler *f2fs_xattr_handlers[]; 117extern const struct xattr_handler *f2fs_xattr_handlers[];
117 118
118extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, 119extern int f2fs_setxattr(struct inode *, int, const char *,
119 const void *value, size_t value_len); 120 const void *, size_t, struct page *);
120extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, 121extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
121 void *buffer, size_t buffer_size); 122extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
122extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
123 size_t buffer_size);
124
125#else 123#else
126 124
127#define f2fs_xattr_handlers NULL 125#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int name_index, 126static inline int f2fs_setxattr(struct inode *inode, int name_index,
129 const char *name, const void *value, size_t value_len) 127 const char *name, const void *value, size_t value_len)
130{ 128{
131 return -EOPNOTSUPP; 129 return -EOPNOTSUPP;
132} 130}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
142} 140}
143#endif 141#endif
144 142
143#ifdef CONFIG_F2FS_FS_SECURITY
144extern int f2fs_init_security(struct inode *, struct inode *,
145 const struct qstr *, struct page *);
146#else
147static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
148 const struct qstr *qstr, struct page *ipage)
149{
150 return 0;
151}
152#endif
145#endif /* __F2FS_XATTR_H__ */ 153#endif /* __F2FS_XATTR_H__ */
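
For context on how the pieces above fit together: security_inode_init_security() asks the active LSM for the security.* xattrs of a brand-new inode and hands them to the filesystem's initxattrs callback (f2fs_initxattrs() above) for persistence. A hedged sketch of the caller side, with demo_create() and its error handling assumed rather than taken from this patch:

static int demo_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        struct inode *inode = new_inode(dir->i_sb);
        int err;

        if (!inode)
                return -ENOMEM;
        inode->i_mode = mode;

        /* The LSM computes the label; f2fs_initxattrs() stores it via
         * f2fs_setxattr(). A NULL ipage takes the update_inode_page() path. */
        err = f2fs_init_security(inode, dir, &dentry->d_name, NULL);
        if (err) {
                iput(inode);
                return err;
        }
        d_instantiate(dentry, inode);
        return 0;
}
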
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7a6f02caf286..3963ede84eb0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
543EXPORT_SYMBOL_GPL(fat_search_long); 543EXPORT_SYMBOL_GPL(fat_search_long);
544 544
545struct fat_ioctl_filldir_callback { 545struct fat_ioctl_filldir_callback {
546 struct dir_context ctx;
546 void __user *dirent; 547 void __user *dirent;
547 int result; 548 int result;
548 /* for dir ioctl */ 549 /* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
552 int short_len; 553 int short_len;
553}; 554};
554 555
555static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, 556static int __fat_readdir(struct inode *inode, struct file *file,
556 filldir_t filldir, int short_only, int both) 557 struct dir_context *ctx, int short_only,
558 struct fat_ioctl_filldir_callback *both)
557{ 559{
558 struct super_block *sb = inode->i_sb; 560 struct super_block *sb = inode->i_sb;
559 struct msdos_sb_info *sbi = MSDOS_SB(sb); 561 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
564 unsigned char bufname[FAT_MAX_SHORT_SIZE]; 566 unsigned char bufname[FAT_MAX_SHORT_SIZE];
565 int isvfat = sbi->options.isvfat; 567 int isvfat = sbi->options.isvfat;
566 const char *fill_name = NULL; 568 const char *fill_name = NULL;
567 unsigned long inum; 569 int fake_offset = 0;
568 unsigned long lpos, dummy, *furrfu = &lpos;
569 loff_t cpos; 570 loff_t cpos;
570 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
571 int ret = 0; 572 int ret = 0;
572 573
573 mutex_lock(&sbi->s_lock); 574 mutex_lock(&sbi->s_lock);
574 575
575 cpos = filp->f_pos; 576 cpos = ctx->pos;
576 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
577 if (inode->i_ino == MSDOS_ROOT_INO) { 578 if (inode->i_ino == MSDOS_ROOT_INO) {
578 while (cpos < 2) { 579 if (!dir_emit_dots(file, ctx))
579 if (filldir(dirent, "..", cpos+1, cpos, 580 goto out;
580 MSDOS_ROOT_INO, DT_DIR) < 0) 581 if (ctx->pos == 2) {
581 goto out; 582 fake_offset = 1;
582 cpos++;
583 filp->f_pos++;
584 }
585 if (cpos == 2) {
586 dummy = 2;
587 furrfu = &dummy;
588 cpos = 0; 583 cpos = 0;
589 } 584 }
590 } 585 }
@@ -619,7 +614,7 @@ parse_record:
619 int status = fat_parse_long(inode, &cpos, &bh, &de, 614 int status = fat_parse_long(inode, &cpos, &bh, &de,
620 &unicode, &nr_slots); 615 &unicode, &nr_slots);
621 if (status < 0) { 616 if (status < 0) {
622 filp->f_pos = cpos; 617 ctx->pos = cpos;
623 ret = status; 618 ret = status;
624 goto out; 619 goto out;
625 } else if (status == PARSE_INVALID) 620 } else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
639 /* !both && !short_only, so we don't need shortname. */ 634 /* !both && !short_only, so we don't need shortname. */
640 if (!both) 635 if (!both)
641 goto start_filldir; 636 goto start_filldir;
637
638 short_len = fat_parse_short(sb, de, bufname,
639 sbi->options.dotsOK);
640 if (short_len == 0)
641 goto record_end;
642 /* hack for fat_ioctl_filldir() */
643 both->longname = fill_name;
644 both->long_len = fill_len;
645 both->shortname = bufname;
646 both->short_len = short_len;
647 fill_name = NULL;
648 fill_len = 0;
649 goto start_filldir;
642 } 650 }
643 } 651 }
644 652
@@ -646,28 +654,21 @@ parse_record:
646 if (short_len == 0) 654 if (short_len == 0)
647 goto record_end; 655 goto record_end;
648 656
649 if (nr_slots) { 657 fill_name = bufname;
650 /* hack for fat_ioctl_filldir() */ 658 fill_len = short_len;
651 struct fat_ioctl_filldir_callback *p = dirent;
652
653 p->longname = fill_name;
654 p->long_len = fill_len;
655 p->shortname = bufname;
656 p->short_len = short_len;
657 fill_name = NULL;
658 fill_len = 0;
659 } else {
660 fill_name = bufname;
661 fill_len = short_len;
662 }
663 659
664start_filldir: 660start_filldir:
665 lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); 661 if (!fake_offset)
666 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) 662 ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
667 inum = inode->i_ino; 663
668 else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { 664 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
669 inum = parent_ino(filp->f_path.dentry); 665 if (!dir_emit_dot(file, ctx))
666 goto fill_failed;
667 } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
668 if (!dir_emit_dotdot(file, ctx))
669 goto fill_failed;
670 } else { 670 } else {
671 unsigned long inum;
671 loff_t i_pos = fat_make_i_pos(sb, bh, de); 672 loff_t i_pos = fat_make_i_pos(sb, bh, de);
672 struct inode *tmp = fat_iget(sb, i_pos); 673 struct inode *tmp = fat_iget(sb, i_pos);
673 if (tmp) { 674 if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
675 iput(tmp); 676 iput(tmp);
676 } else 677 } else
677 inum = iunique(sb, MSDOS_ROOT_INO); 678 inum = iunique(sb, MSDOS_ROOT_INO);
679 if (!dir_emit(ctx, fill_name, fill_len, inum,
680 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
681 goto fill_failed;
678 } 682 }
679 683
680 if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
681 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
682 goto fill_failed;
683
684record_end: 684record_end:
685 furrfu = &lpos; 685 fake_offset = 0;
686 filp->f_pos = cpos; 686 ctx->pos = cpos;
687 goto get_new; 687 goto get_new;
688end_of_dir: 688end_of_dir:
689 filp->f_pos = cpos; 689 ctx->pos = cpos;
690fill_failed: 690fill_failed:
691 brelse(bh); 691 brelse(bh);
692 if (unicode) 692 if (unicode)
@@ -696,10 +696,9 @@ out:
696 return ret; 696 return ret;
697} 697}
698 698
699static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) 699static int fat_readdir(struct file *file, struct dir_context *ctx)
700{ 700{
701 struct inode *inode = file_inode(filp); 701 return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
702 return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
703} 702}
704 703
705#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ 704#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
@@ -755,20 +754,25 @@ efault: \
755 754
756FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) 755FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
757 756
758static int fat_ioctl_readdir(struct inode *inode, struct file *filp, 757static int fat_ioctl_readdir(struct inode *inode, struct file *file,
759 void __user *dirent, filldir_t filldir, 758 void __user *dirent, filldir_t filldir,
760 int short_only, int both) 759 int short_only, int both)
761{ 760{
762 struct fat_ioctl_filldir_callback buf; 761 struct fat_ioctl_filldir_callback buf = {
762 .ctx.actor = filldir,
763 .dirent = dirent
764 };
763 int ret; 765 int ret;
764 766
765 buf.dirent = dirent; 767 buf.dirent = dirent;
766 buf.result = 0; 768 buf.result = 0;
767 mutex_lock(&inode->i_mutex); 769 mutex_lock(&inode->i_mutex);
770 buf.ctx.pos = file->f_pos;
768 ret = -ENOENT; 771 ret = -ENOENT;
769 if (!IS_DEADDIR(inode)) { 772 if (!IS_DEADDIR(inode)) {
770 ret = __fat_readdir(inode, filp, &buf, filldir, 773 ret = __fat_readdir(inode, file, &buf.ctx,
771 short_only, both); 774 short_only, both ? &buf : NULL);
775 file->f_pos = buf.ctx.pos;
772 } 776 }
773 mutex_unlock(&inode->i_mutex); 777 mutex_unlock(&inode->i_mutex);
774 if (ret >= 0) 778 if (ret >= 0)
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
854const struct file_operations fat_dir_operations = { 858const struct file_operations fat_dir_operations = {
855 .llseek = generic_file_llseek, 859 .llseek = generic_file_llseek,
856 .read = generic_read_dir, 860 .read = generic_read_dir,
857 .readdir = fat_readdir, 861 .iterate = fat_readdir,
858 .unlocked_ioctl = fat_dir_ioctl, 862 .unlocked_ioctl = fat_dir_ioctl,
859#ifdef CONFIG_COMPAT 863#ifdef CONFIG_COMPAT
860 .compat_ioctl = fat_compat_dir_ioctl, 864 .compat_ioctl = fat_compat_dir_ioctl,
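
The conversion above replaces the old ->readdir/filldir calling convention with ->iterate and struct dir_context: the position now lives in ctx->pos and entries are emitted through the dir_emit*() helpers. A minimal sketch of the new contract for a hypothetical directory with one fixed entry (the demo_* names and the inode number are assumptions):

static int demo_iterate(struct file *file, struct dir_context *ctx)
{
        /* emits "." and ".." as needed and advances ctx->pos to 2 */
        if (!dir_emit_dots(file, ctx))
                return 0;

        if (ctx->pos == 2) {
                /* a false return means the user buffer is full; we get
                 * called again later with ctx->pos unchanged */
                if (!dir_emit(ctx, "hello", 5, 100 /* ino */, DT_REG))
                        return 0;
                ctx->pos++;
        }
        return 0;
}
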
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
86 const void *dir_ops; /* Opaque; default directory operations */ 86 const void *dir_ops; /* Opaque; default directory operations */
87 int dir_per_block; /* dir entries per block */ 87 int dir_per_block; /* dir entries per block */
88 int dir_per_block_bits; /* log2(dir_per_block) */ 88 int dir_per_block_bits; /* log2(dir_per_block) */
89 unsigned int vol_id; /* volume ID */
89 90
90 int fatent_shift; 91 int fatent_shift;
91 struct fatent_operations *fatent_ops; 92 struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
114 return err; 114 return err;
115} 115}
116 116
117static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
118{
119 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
120 return put_user(sbi->vol_id, user_attr);
121}
122
117long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 123long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118{ 124{
119 struct inode *inode = file_inode(filp); 125 struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
124 return fat_ioctl_get_attributes(inode, user_attr); 130 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES: 131 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr); 132 return fat_ioctl_set_attributes(filp, user_attr);
133 case FAT_IOCTL_GET_VOLUME_ID:
134 return fat_ioctl_get_volume_id(inode, user_attr);
127 default: 135 default:
128 return -ENOTTY; /* Inappropriate ioctl for device */ 136 return -ENOTTY; /* Inappropriate ioctl for device */
129 } 137 }
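
From userspace the new ioctl takes only a few lines; this sketch assumes a kernel whose uapi linux/msdos_fs.h exports FAT_IOCTL_GET_VOLUME_ID:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>     /* FAT_IOCTL_GET_VOLUME_ID */

int main(int argc, char **argv)
{
        uint32_t id;
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);   /* any file on the FAT volume */
        if (fd < 0 || ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &id) < 0) {
                perror(argv[1]);
                return 1;
        }
        printf("%04x-%04x\n", id >> 16, id & 0xffff);   /* DOS-style XXXX-XXXX */
        return 0;
}

Run it against any file or directory on the mounted volume; the printf formats the ID the way DOS displays it.
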
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1415 brelse(fsinfo_bh); 1415 brelse(fsinfo_bh);
1416 } 1416 }
1417 1417
1418 /* interpret the volume ID as a little-endian 32-bit integer */
1419 if (sbi->fat_bits == 32)
1420 sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
1421 ((u32)b->fat32.vol_id[1] << 8) |
1422 ((u32)b->fat32.vol_id[2] << 16) |
1423 ((u32)b->fat32.vol_id[3] << 24));
1424 else /* fat 16 or 12 */
1425 sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
1426 ((u32)b->fat16.vol_id[1] << 8) |
1427 ((u32)b->fat16.vol_id[2] << 16) |
1428 ((u32)b->fat16.vol_id[3] << 24));
1429
1418 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); 1430 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
1419 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; 1431 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
1420 1432
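
Assembling the ID byte by byte keeps the read correct on both little- and big-endian hosts. The same pattern in standalone C (a sketch, not kernel code):

#include <stdint.h>

static uint32_t read_le32(const uint8_t b[4])
{
        /* b[0] is the least significant byte on disk */
        return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
               ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}
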
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
30 va_start(args, fmt); 30 va_start(args, fmt);
31 vaf.fmt = fmt; 31 vaf.fmt = fmt;
32 vaf.va = &args; 32 vaf.va = &args;
33 printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); 33 fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
34 va_end(args); 34 va_end(args);
35 } 35 }
36 36
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); 38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { 39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
40 sb->s_flags |= MS_RDONLY; 40 sb->s_flags |= MS_RDONLY;
41 printk(KERN_ERR "FAT-fs (%s): Filesystem has been " 41 fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
42 "set read-only\n", sb->s_id);
43 } 42 }
44} 43}
45EXPORT_SYMBOL_GPL(__fat_fs_error); 44EXPORT_SYMBOL_GPL(__fat_fs_error);
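
Both hunks switch raw printk() calls to fat_msg(), which centralizes the "FAT-fs (%s):" prefix and uses the %pV extension to forward a caller's format string and va_list in a single printk(). A condensed sketch of such a helper (the real fat_msg() lives in fs/fat/misc.c):

static void demo_fat_msg(struct super_block *sb, const char *level,
                         const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
        va_end(args);
}
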
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(const struct dentry *dentry, const struct inode *inode, 151static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
152 struct qstr *qstr)
153{ 152{
154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
155 unsigned char msdos_name[MSDOS_NAME]; 154 unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
165 * Compare two msdos names. If either of the names are invalid, 164 * Compare two msdos names. If either of the names are invalid,
166 * we fall back to doing the standard name comparison. 165 * we fall back to doing the standard name comparison.
167 */ 166 */
168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, 167static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name) 168 unsigned int len, const char *str, const struct qstr *name)
171{ 169{
172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; 170 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
107 * that the existing dentry can be used. The vfat fs routines will 107 * that the existing dentry can be used. The vfat fs routines will
108 * return ENOENT or EINVAL as appropriate. 108 * return ENOENT or EINVAL as appropriate.
109 */ 109 */
110static int vfat_hash(const struct dentry *dentry, const struct inode *inode, 110static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
111 struct qstr *qstr)
112{ 111{
113 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 112 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
114 return 0; 113 return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
120 * that the existing dentry can be used. The vfat fs routines will 119 * that the existing dentry can be used. The vfat fs routines will
121 * return ENOENT or EINVAL as appropriate. 120 * return ENOENT or EINVAL as appropriate.
122 */ 121 */
123static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, 122static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
124 struct qstr *qstr)
125{ 123{
126 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; 124 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
127 const unsigned char *name; 125 const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
142/* 140/*
143 * Case insensitive compare of two vfat names. 141 * Case insensitive compare of two vfat names.
144 */ 142 */
145static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, 143static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
146 const struct dentry *dentry, const struct inode *inode,
147 unsigned int len, const char *str, const struct qstr *name) 144 unsigned int len, const char *str, const struct qstr *name)
148{ 145{
149 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; 146 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
162/* 159/*
163 * Case sensitive compare of two vfat names. 160 * Case sensitive compare of two vfat names.
164 */ 161 */
165static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, 162static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
166 const struct dentry *dentry, const struct inode *inode,
167 unsigned int len, const char *str, const struct qstr *name) 163 unsigned int len, const char *str, const struct qstr *name)
168{ 164{
169 unsigned int alen, blen; 165 unsigned int alen, blen;
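
These hunks track a VFS-wide signature change: ->d_hash() loses its inode argument and ->d_compare() loses both inode arguments, since RCU-walk lookups could not rely on them anyway. A sketch of the post-change hooks for a hypothetical case-sensitive filesystem (the demo_* names are assumptions):

static int demo_hash(const struct dentry *dentry, struct qstr *qstr)
{
        qstr->hash = full_name_hash(qstr->name, qstr->len);
        return 0;
}

static int demo_compare(const struct dentry *parent,
                        const struct dentry *dentry,
                        unsigned int len, const char *str,
                        const struct qstr *name)
{
        /* return 0 on a match, nonzero otherwise */
        if (len != name->len)
                return 1;
        return memcmp(str, name->name, len) ? 1 : 0;
}

static const struct dentry_operations demo_dops = {
        .d_hash         = demo_hash,
        .d_compare      = demo_compare,
};
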
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6599222536eb..65343c3741ff 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -730,14 +730,14 @@ static int __init fcntl_init(void)
730 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 730 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
731 * is defined as O_NONBLOCK on some platforms and not on others. 731 * is defined as O_NONBLOCK on some platforms and not on others.
732 */ 732 */
733 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 733 BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
734 O_RDONLY | O_WRONLY | O_RDWR | 734 O_RDONLY | O_WRONLY | O_RDWR |
735 O_CREAT | O_EXCL | O_NOCTTY | 735 O_CREAT | O_EXCL | O_NOCTTY |
736 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 736 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
737 __O_SYNC | O_DSYNC | FASYNC | 737 __O_SYNC | O_DSYNC | FASYNC |
738 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 738 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
739 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 739 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
740 __FMODE_EXEC | O_PATH 740 __FMODE_EXEC | O_PATH | __O_TMPFILE
741 )); 741 ));
742 742
743 fasync_cache = kmem_cache_create("fasync_cache", 743 fasync_cache = kmem_cache_create("fasync_cache",
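
The constant rises from 19 to 20 because __O_TMPFILE contributes one more distinct flag bit; HWEIGHT32() is a compile-time population count, so the assertion breaks the build whenever a new O_* flag is added without updating the count. A userspace analog of the trick (a sketch):

#include <assert.h>

#define FLAG_A 0x01
#define FLAG_B 0x02
#define FLAG_C 0x08

/* the build fails if the flags overlap or the expected count drifts */
static_assert(__builtin_popcount(FLAG_A | FLAG_B | FLAG_C) == 3,
              "distinct flag-bit count changed");
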
diff --git a/fs/file_table.c b/fs/file_table.c
index 485dc0eddd67..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
227{ 227{
228 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
229 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
230 struct inode *inode = dentry->d_inode; 230 struct inode *inode = file->f_inode;
231 231
232 might_sleep(); 232 might_sleep();
233 233
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
265 mntput(mnt); 265 mntput(mnt);
266} 266}
267 267
268static DEFINE_SPINLOCK(delayed_fput_lock); 268static LLIST_HEAD(delayed_fput_list);
269static LIST_HEAD(delayed_fput_list);
270static void delayed_fput(struct work_struct *unused) 269static void delayed_fput(struct work_struct *unused)
271{ 270{
272 LIST_HEAD(head); 271 struct llist_node *node = llist_del_all(&delayed_fput_list);
273 spin_lock_irq(&delayed_fput_lock); 272 struct llist_node *next;
274 list_splice_init(&delayed_fput_list, &head); 273
275 spin_unlock_irq(&delayed_fput_lock); 274 for (; node; node = next) {
276 while (!list_empty(&head)) { 275 next = llist_next(node);
277 struct file *f = list_first_entry(&head, struct file, f_u.fu_list); 276 __fput(llist_entry(node, struct file, f_u.fu_llist));
278 list_del_init(&f->f_u.fu_list);
279 __fput(f);
280 } 277 }
281} 278}
282 279
@@ -306,18 +303,22 @@ void fput(struct file *file)
306{ 303{
307 if (atomic_long_dec_and_test(&file->f_count)) { 304 if (atomic_long_dec_and_test(&file->f_count)) {
308 struct task_struct *task = current; 305 struct task_struct *task = current;
309 unsigned long flags;
310 306
311 file_sb_list_del(file); 307 file_sb_list_del(file);
312 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { 308 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
313 init_task_work(&file->f_u.fu_rcuhead, ____fput); 309 init_task_work(&file->f_u.fu_rcuhead, ____fput);
314 if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) 310 if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
315 return; 311 return;
312 /*
313 * After this task has run exit_task_work(),
314 * task_work_add() will fail. free_ipc_ns()->
315 * shm_destroy() can do this. Fall through to delayed
316 * fput to avoid leaking *file.
317 */
316 } 318 }
317 spin_lock_irqsave(&delayed_fput_lock, flags); 319
318 list_add(&file->f_u.fu_list, &delayed_fput_list); 320 if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
319 schedule_work(&delayed_fput_work); 321 schedule_work(&delayed_fput_work);
320 spin_unlock_irqrestore(&delayed_fput_lock, flags);
321 } 322 }
322} 323}
323 324
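
The delayed-fput rework replaces a spinlock-protected list with a lock-free llist. llist_add() returns true only when the list was previously empty, so exactly one producer per batch schedules the worker, and llist_del_all() lets the worker detach the whole batch in a single atomic exchange. The pattern in isolation (a sketch; the demo_* names are assumptions):

struct demo_item {
        struct llist_node node;
};

static void demo_consume(struct demo_item *it);  /* hypothetical handler */

static LLIST_HEAD(demo_pending);

static void demo_worker(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&demo_pending);
        struct llist_node *next;

        for (; node; node = next) {
                next = llist_next(node);
                demo_consume(llist_entry(node, struct demo_item, node));
        }
}

static DECLARE_WORK(demo_work, demo_worker);

static void demo_produce(struct demo_item *it)
{
        /* only the push that finds the list empty schedules the worker */
        if (llist_add(&it->node, &demo_pending))
                schedule_work(&demo_work);
}
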
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a53870..25d4099a4aea 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
49 49
50 50
51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); 51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
52static int vxfs_readdir(struct file *, void *, filldir_t); 52static int vxfs_readdir(struct file *, struct dir_context *);
53 53
54const struct inode_operations vxfs_dir_inode_ops = { 54const struct inode_operations vxfs_dir_inode_ops = {
55 .lookup = vxfs_lookup, 55 .lookup = vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
58const struct file_operations vxfs_dir_operations = { 58const struct file_operations vxfs_dir_operations = {
59 .llseek = generic_file_llseek, 59 .llseek = generic_file_llseek,
60 .read = generic_read_dir, 60 .read = generic_read_dir,
61 .readdir = vxfs_readdir, 61 .iterate = vxfs_readdir,
62}; 62};
63 63
64 64
@@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
235 * Zero. 235 * Zero.
236 */ 236 */
237static int 237static int
238vxfs_readdir(struct file *fp, void *retp, filldir_t filler) 238vxfs_readdir(struct file *fp, struct dir_context *ctx)
239{ 239{
240 struct inode *ip = file_inode(fp); 240 struct inode *ip = file_inode(fp);
241 struct super_block *sbp = ip->i_sb; 241 struct super_block *sbp = ip->i_sb;
@@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
243 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
244 loff_t pos; 244 loff_t pos;
245 245
246 switch ((long)fp->f_pos) { 246 if (ctx->pos == 0) {
247 case 0: 247 if (!dir_emit_dot(fp, ctx))
248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 return 0;
249 goto out; 249 ctx->pos = 1;
250 fp->f_pos++;
251 /* fallthrough */
252 case 1:
253 if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
254 goto out;
255 fp->f_pos++;
256 /* fallthrough */
257 } 250 }
258 251 if (ctx->pos == 1) {
259 pos = fp->f_pos - 2; 252 if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
253 return 0;
254 ctx->pos = 2;
255 }
256 pos = ctx->pos - 2;
260 257
261 if (pos > VXFS_DIRROUND(ip->i_size)) 258 if (pos > VXFS_DIRROUND(ip->i_size))
262 return 0; 259 return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
270 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; 267 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
271 268
272 for (; page < npages; page++, block = 0) { 269 for (; page < npages; page++, block = 0) {
273 caddr_t kaddr; 270 char *kaddr;
274 struct page *pp; 271 struct page *pp;
275 272
276 pp = vxfs_get_page(ip->i_mapping, page); 273 pp = vxfs_get_page(ip->i_mapping, page);
277 if (IS_ERR(pp)) 274 if (IS_ERR(pp))
278 continue; 275 continue;
279 kaddr = (caddr_t)page_address(pp); 276 kaddr = (char *)page_address(pp);
280 277
281 for (; block <= nblocks && block <= pblocks; block++) { 278 for (; block <= nblocks && block <= pblocks; block++) {
282 caddr_t baddr, limit; 279 char *baddr, *limit;
283 struct vxfs_dirblk *dbp; 280 struct vxfs_dirblk *dbp;
284 struct vxfs_direct *de; 281 struct vxfs_direct *de;
285 282
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
292 (kaddr + offset) : 289 (kaddr + offset) :
293 (baddr + VXFS_DIRBLKOV(dbp))); 290 (baddr + VXFS_DIRBLKOV(dbp)));
294 291
295 for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { 292 for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
296 int over;
297
298 if (!de->d_reclen) 293 if (!de->d_reclen)
299 break; 294 break;
300 if (!de->d_ino) 295 if (!de->d_ino)
301 continue; 296 continue;
302 297
303 offset = (caddr_t)de - kaddr; 298 offset = (char *)de - kaddr;
304 over = filler(retp, de->d_name, de->d_namelen, 299 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
305 ((page << PAGE_CACHE_SHIFT) | offset) + 2, 300 if (!dir_emit(ctx, de->d_name, de->d_namelen,
306 de->d_ino, DT_UNKNOWN); 301 de->d_ino, DT_UNKNOWN)) {
307 if (over) {
308 vxfs_put_page(pp); 302 vxfs_put_page(pp);
309 goto done; 303 return 0;
310 } 304 }
311 } 305 }
312 offset = 0; 306 offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
314 vxfs_put_page(pp); 308 vxfs_put_page(pp);
315 offset = 0; 309 offset = 0;
316 } 310 }
317 311 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
318done:
319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
320out:
321 return 0; 312 return 0;
322} 313}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
48 enum wb_reason reason; /* why was writeback initiated? */ 49 enum wb_reason reason; /* why was writeback initiated? */
49 50
50 struct list_head list; /* pending work list */ 51 struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
443 /* 444 /*
444 * Make sure to wait on the data before writing out the metadata. 445 * Make sure to wait on the data before writing out the metadata.
445 * This is important for filesystems that modify metadata on data 446 * This is important for filesystems that modify metadata on data
446 * I/O completion. 447 * I/O completion. We don't do it for sync(2) writeback because it has a
448 * separate, external IO completion path and ->sync_fs for guaranteeing
449 * inode metadata is written back correctly.
447 */ 450 */
448 if (wbc->sync_mode == WB_SYNC_ALL) { 451 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
449 int err = filemap_fdatawait(mapping); 452 int err = filemap_fdatawait(mapping);
450 if (ret == 0) 453 if (ret == 0)
451 ret = err; 454 ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
578 .tagged_writepages = work->tagged_writepages, 581 .tagged_writepages = work->tagged_writepages,
579 .for_kupdate = work->for_kupdate, 582 .for_kupdate = work->for_kupdate,
580 .for_background = work->for_background, 583 .for_background = work->for_background,
584 .for_sync = work->for_sync,
581 .range_cyclic = work->range_cyclic, 585 .range_cyclic = work->range_cyclic,
582 .range_start = 0, 586 .range_start = 0,
583 .range_end = LLONG_MAX, 587 .range_end = LLONG_MAX,
@@ -959,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
959/* 963/*
960 * Retrieve work items and do the writeback they describe 964 * Retrieve work items and do the writeback they describe
961 */ 965 */
962long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 966static long wb_do_writeback(struct bdi_writeback *wb)
963{ 967{
964 struct backing_dev_info *bdi = wb->bdi; 968 struct backing_dev_info *bdi = wb->bdi;
965 struct wb_writeback_work *work; 969 struct wb_writeback_work *work;
@@ -967,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
967 971
968 set_bit(BDI_writeback_running, &wb->bdi->state); 972 set_bit(BDI_writeback_running, &wb->bdi->state);
969 while ((work = get_next_work_item(bdi)) != NULL) { 973 while ((work = get_next_work_item(bdi)) != NULL) {
970 /*
971 * Override sync mode, in case we must wait for completion
972 * because this thread is exiting now.
973 */
974 if (force_wait)
975 work->sync_mode = WB_SYNC_ALL;
976 974
977 trace_writeback_exec(bdi, work); 975 trace_writeback_exec(bdi, work);
978 976
@@ -1021,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1021 * rescuer as work_list needs to be drained. 1019 * rescuer as work_list needs to be drained.
1022 */ 1020 */
1023 do { 1021 do {
1024 pages_written = wb_do_writeback(wb, 0); 1022 pages_written = wb_do_writeback(wb);
1025 trace_writeback_pages_written(pages_written); 1023 trace_writeback_pages_written(pages_written);
1026 } while (!list_empty(&bdi->work_list)); 1024 } while (!list_empty(&bdi->work_list));
1027 } else { 1025 } else {
@@ -1362,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb)
1362 .range_cyclic = 0, 1360 .range_cyclic = 0,
1363 .done = &done, 1361 .done = &done,
1364 .reason = WB_REASON_SYNC, 1362 .reason = WB_REASON_SYNC,
1363 .for_sync = 1,
1365 }; 1364 };
1366 1365
1367 /* Nothing to do? */ 1366 /* Nothing to do? */
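
The new for_sync bit tells __writeback_single_inode() to skip its per-inode filemap_fdatawait() when writeback was initiated by sync(2), which waits for data through its own completion path and relies on ->sync_fs for metadata. Condensed from the sync_inodes_sb() hunk above, the sync path marks its work item like this (a fragment; sb and done are supplied by the caller):

struct wb_writeback_work work = {
        .sb             = sb,
        .sync_mode      = WB_SYNC_ALL,
        .nr_pages       = LONG_MAX,
        .range_cyclic   = 0,
        .done           = &done,
        .reason         = WB_REASON_SYNC,
        .for_sync       = 1,    /* per-inode writeback must not wait on data */
};
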
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
115 struct fscache_object, cookie_link); 115 struct fscache_object, cookie_link);
116 116
117 cache = object->cache; 117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING || 118 if (fscache_object_is_dying(object) ||
119 test_bit(FSCACHE_IOERROR, &cache->flags)) 119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL; 120 cache = NULL;
121 121
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
224 BUG_ON(!ifsdef); 224 BUG_ON(!ifsdef);
225 225
226 cache->flags = 0; 226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); 227 ifsdef->event_mask =
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE; 228 ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
229 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
230 __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
229 231
230 if (!tagname) 232 if (!tagname)
231 tagname = cache->identifier; 233 tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
330{ 332{
331 struct fscache_object *object; 333 struct fscache_object *object;
332 334
333 spin_lock(&cache->object_list_lock);
334
335 while (!list_empty(&cache->object_list)) { 335 while (!list_empty(&cache->object_list)) {
336 object = list_entry(cache->object_list.next, 336 spin_lock(&cache->object_list_lock);
337 struct fscache_object, cache_link);
338 list_move_tail(&object->cache_link, dying_objects);
339 337
340 _debug("withdraw %p", object->cookie); 338 if (!list_empty(&cache->object_list)) {
339 object = list_entry(cache->object_list.next,
340 struct fscache_object, cache_link);
341 list_move_tail(&object->cache_link, dying_objects);
341 342
342 spin_lock(&object->lock); 343 _debug("withdraw %p", object->cookie);
343 spin_unlock(&cache->object_list_lock); 344
344 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); 345 /* This must be done under object_list_lock to prevent
345 spin_unlock(&object->lock); 346 * a race with fscache_drop_object().
347 */
348 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
349 }
346 350
351 spin_unlock(&cache->object_list_lock);
347 cond_resched(); 352 cond_resched();
348 spin_lock(&cache->object_list_lock);
349 } 353 }
350
351 spin_unlock(&cache->object_list_lock);
352} 354}
353 355
354/** 356/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
95 atomic_set(&cookie->usage, 1); 95 atomic_set(&cookie->usage, 1);
96 atomic_set(&cookie->n_children, 0); 96 atomic_set(&cookie->n_children, 0);
97 97
98 /* We keep the active count elevated until relinquishment to prevent an
99 * attempt to wake up every time the object operations queue quiesces.
100 */
101 atomic_set(&cookie->n_active, 1);
102
98 atomic_inc(&parent->usage); 103 atomic_inc(&parent->usage);
99 atomic_inc(&parent->n_children); 104 atomic_inc(&parent->n_children);
100 105
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
177 182
178 cookie->flags = 183 cookie->flags =
179 (1 << FSCACHE_COOKIE_LOOKING_UP) | 184 (1 << FSCACHE_COOKIE_LOOKING_UP) |
180 (1 << FSCACHE_COOKIE_CREATING) |
181 (1 << FSCACHE_COOKIE_NO_DATA_YET); 185 (1 << FSCACHE_COOKIE_NO_DATA_YET);
182 186
183 /* ask the cache to allocate objects for this cookie and its parent 187 /* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
205 209
206 /* initiate the process of looking up all the objects in the chain 210 /* initiate the process of looking up all the objects in the chain
207 * (done by fscache_initialise_object()) */ 211 * (done by fscache_initialise_object()) */
208 fscache_enqueue_object(object); 212 fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
209 213
210 spin_unlock(&cookie->lock); 214 spin_unlock(&cookie->lock);
211 215
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
285 289
286object_already_extant: 290object_already_extant:
287 ret = -ENOBUFS; 291 ret = -ENOBUFS;
288 if (object->state >= FSCACHE_OBJECT_DYING) { 292 if (fscache_object_is_dead(object)) {
289 spin_unlock(&cookie->lock); 293 spin_unlock(&cookie->lock);
290 goto error; 294 goto error;
291 } 295 }
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
321 ret = -EEXIST; 325 ret = -EEXIST;
322 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { 326 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
323 if (p->cache == object->cache) { 327 if (p->cache == object->cache) {
324 if (p->state >= FSCACHE_OBJECT_DYING) 328 if (fscache_object_is_dying(p))
325 ret = -ENOBUFS; 329 ret = -ENOBUFS;
326 goto cant_attach_object; 330 goto cant_attach_object;
327 } 331 }
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
332 hlist_for_each_entry(p, &cookie->parent->backing_objects, 336 hlist_for_each_entry(p, &cookie->parent->backing_objects,
333 cookie_link) { 337 cookie_link) {
334 if (p->cache == object->cache) { 338 if (p->cache == object->cache) {
335 if (p->state >= FSCACHE_OBJECT_DYING) { 339 if (fscache_object_is_dying(p)) {
336 ret = -ENOBUFS; 340 ret = -ENOBUFS;
337 spin_unlock(&cookie->parent->lock); 341 spin_unlock(&cookie->parent->lock);
338 goto cant_attach_object; 342 goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
400 object = hlist_entry(cookie->backing_objects.first, 404 object = hlist_entry(cookie->backing_objects.first,
401 struct fscache_object, 405 struct fscache_object,
402 cookie_link); 406 cookie_link);
403 if (object->state < FSCACHE_OBJECT_DYING) 407 if (fscache_object_is_live(object))
404 fscache_raise_event( 408 fscache_raise_event(
405 object, FSCACHE_OBJECT_EV_INVALIDATE); 409 object, FSCACHE_OBJECT_EV_INVALIDATE);
406 } 410 }
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
467 */ 471 */
468void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) 472void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
469{ 473{
470 struct fscache_cache *cache;
471 struct fscache_object *object; 474 struct fscache_object *object;
472 unsigned long event;
473 475
474 fscache_stat(&fscache_n_relinquishes); 476 fscache_stat(&fscache_n_relinquishes);
475 if (retire) 477 if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
481 return; 483 return;
482 } 484 }
483 485
484 _enter("%p{%s,%p},%d", 486 _enter("%p{%s,%p,%d},%d",
485 cookie, cookie->def->name, cookie->netfs_data, retire); 487 cookie, cookie->def->name, cookie->netfs_data,
488 atomic_read(&cookie->n_active), retire);
489
490 ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
486 491
487 if (atomic_read(&cookie->n_children) != 0) { 492 if (atomic_read(&cookie->n_children) != 0) {
488 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", 493 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
490 BUG(); 495 BUG();
491 } 496 }
492 497
493 /* wait for the cookie to finish being instantiated (or to fail) */ 498 /* No further netfs-accessing operations on this cookie permitted */
494 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { 499 set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
495 fscache_stat(&fscache_n_relinquishes_waitcrt); 500 if (retire)
496 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, 501 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
497 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
498 }
499
500 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
501 502
502try_again:
503 spin_lock(&cookie->lock); 503 spin_lock(&cookie->lock);
504 504 hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
505 /* break links with all the active objects */ 505 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
506 while (!hlist_empty(&cookie->backing_objects)) {
507 int n_reads;
508 object = hlist_entry(cookie->backing_objects.first,
509 struct fscache_object,
510 cookie_link);
511
512 _debug("RELEASE OBJ%x", object->debug_id);
513
514 set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
515 n_reads = atomic_read(&object->n_reads);
516 if (n_reads) {
517 int n_ops = object->n_ops;
518 int n_in_progress = object->n_in_progress;
519 spin_unlock(&cookie->lock);
520 printk(KERN_ERR "FS-Cache:"
521 " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
522 cookie->def->name,
523 n_reads, n_ops, n_in_progress);
524 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
525 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
526 printk("Wait finished\n");
527 goto try_again;
528 }
529
530 /* detach each cache object from the object cookie */
531 spin_lock(&object->lock);
532 hlist_del_init(&object->cookie_link);
533
534 cache = object->cache;
535 object->cookie = NULL;
536 fscache_raise_event(object, event);
537 spin_unlock(&object->lock);
538
539 if (atomic_dec_and_test(&cookie->usage))
540 /* the cookie refcount shouldn't be reduced to 0 yet */
541 BUG();
542 } 506 }
507 spin_unlock(&cookie->lock);
543 508
544 /* detach pointers back to the netfs */ 509 /* Wait for cessation of activity requiring access to the netfs (when
510 * n_active reaches 0).
511 */
512 if (!atomic_dec_and_test(&cookie->n_active))
513 wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
514 TASK_UNINTERRUPTIBLE);
515
516 /* Clear pointers back to the netfs */
545 cookie->netfs_data = NULL; 517 cookie->netfs_data = NULL;
546 cookie->def = NULL; 518 cookie->def = NULL;
547 519 BUG_ON(cookie->stores.rnode);
548 spin_unlock(&cookie->lock);
549 520
550 if (cookie->parent) { 521 if (cookie->parent) {
551 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); 522 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
553 atomic_dec(&cookie->parent->n_children); 524 atomic_dec(&cookie->parent->n_children);
554 } 525 }
555 526
556 /* finally dispose of the cookie */ 527 /* Dispose of the netfs's link to the cookie */
557 ASSERTCMP(atomic_read(&cookie->usage), >, 0); 528 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
558 fscache_cookie_put(cookie); 529 fscache_cookie_put(cookie);
559 530
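
The new n_active counter is a quiesce handshake: it starts at 1, every in-flight netfs operation takes a reference, and relinquishment drops the initial reference and then sleeps until the count reaches zero. A sketch of that pattern (the demo_* names are assumptions; fscache_wait_atomic_t() is the sleeper added by this series):

struct demo_obj {
        atomic_t n_active;      /* starts at 1 (the owner's reference) */
};

static void demo_get(struct demo_obj *obj)
{
        atomic_inc(&obj->n_active);
}

static void demo_put(struct demo_obj *obj)
{
        /* the last put wakes the relinquisher below */
        if (atomic_dec_and_test(&obj->n_active))
                wake_up_atomic_t(&obj->n_active);
}

static void demo_relinquish(struct demo_obj *obj)
{
        /* drop the initial reference, then wait out the remaining users */
        if (!atomic_dec_and_test(&obj->n_active))
                wait_on_atomic_t(&obj->n_active, fscache_wait_atomic_t,
                                 TASK_UNINTERRUPTIBLE);
}
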
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
55 55
56struct fscache_cookie fscache_fsdef_index = { 56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1), 57 .usage = ATOMIC_INIT(1),
58 .n_active = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), 59 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT, 60 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def, 61 .def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
93 93
94extern int fscache_wait_bit(void *); 94extern int fscache_wait_bit(void *);
95extern int fscache_wait_bit_interruptible(void *); 95extern int fscache_wait_bit_interruptible(void *);
96extern int fscache_wait_atomic_t(atomic_t *);
96 97
97/* 98/*
98 * object.c 99 * object.c
99 */ 100 */
100extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
101
102extern void fscache_withdrawing_object(struct fscache_cache *,
103 struct fscache_object *);
104extern void fscache_enqueue_object(struct fscache_object *); 101extern void fscache_enqueue_object(struct fscache_object *);
105 102
106/* 103/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
110extern const struct file_operations fscache_objlist_fops; 107extern const struct file_operations fscache_objlist_fops;
111 108
112extern void fscache_objlist_add(struct fscache_object *); 109extern void fscache_objlist_add(struct fscache_object *);
110extern void fscache_objlist_remove(struct fscache_object *);
113#else 111#else
114#define fscache_objlist_add(object) do {} while(0) 112#define fscache_objlist_add(object) do {} while(0)
113#define fscache_objlist_remove(object) do {} while(0)
115#endif 114#endif
116 115
117/* 116/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
291 unsigned event) 290 unsigned event)
292{ 291{
293 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); 292 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
293#if 0
294 printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
295 object->debug_id, object->event_mask, (1 << event));
296#endif
294 if (!test_and_set_bit(event, &object->events) && 297 if (!test_and_set_bit(event, &object->events) &&
295 test_bit(event, &object->event_mask)) 298 test_bit(event, &object->event_mask))
296 fscache_enqueue_object(object); 299 fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
205 schedule(); 205 schedule();
206 return 0; 206 return 0;
207} 207}
208EXPORT_SYMBOL(fscache_wait_bit);
209 208
210/* 209/*
211 * wait_on_bit() sleep function for interruptible waiting 210 * wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
215 schedule(); 214 schedule();
216 return signal_pending(current); 215 return signal_pending(current);
217} 216}
218EXPORT_SYMBOL(fscache_wait_bit_interruptible); 217
218/*
219 * wait_on_atomic_t() sleep function for uninterruptible waiting
220 */
221int fscache_wait_atomic_t(atomic_t *p)
222{
223 schedule();
224 return 0;
225}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	/* initialise the primary index cookie */
 	atomic_set(&netfs->primary_index->usage, 1);
 	atomic_set(&netfs->primary_index->n_children, 0);
+	atomic_set(&netfs->primary_index->n_active, 1);
 
 	netfs->primary_index->def	= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent	= &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
 	write_unlock(&fscache_object_list_lock);
 }
 
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
  */
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
 {
 	write_lock(&fscache_object_list_lock);
 
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
 
 	write_unlock(&fscache_object_list_lock);
 }
-EXPORT_SYMBOL(fscache_object_destroy);
 
 /*
  * find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 {
 	struct fscache_objlist_data *data = m->private;
 	struct fscache_object *obj = v;
+	struct fscache_cookie *cookie;
 	unsigned long config = data->config;
-	uint16_t keylen, auxlen;
 	char _type[3], *type;
-	bool no_cookie;
 	u8 *buf = data->buf, *p;
 
 	if ((unsigned long) v == 1) {
 		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
-			   " EM EV F S"
+			   " EM EV FL S"
 			   " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
193 188
194 if ((unsigned long) v == 2) { 189 if ((unsigned long) v == 2) {
195 seq_puts(m, "======== ======== ==== ===== === === === == =====" 190 seq_puts(m, "======== ======== ==== ===== === === === == ====="
196 " == == = =" 191 " == == == ="
197 " | ================ == == ================"); 192 " | ================ == == ================");
198 if (config & (FSCACHE_OBJLIST_CONFIG_KEY | 193 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
199 FSCACHE_OBJLIST_CONFIG_AUX)) 194 FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		}						\
 	} while(0)
 
+	cookie = obj->cookie;
 	if (~config) {
-		FILTER(obj->cookie,
+		FILTER(cookie->def,
 		       COOKIE, NOCOOKIE);
-		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		FILTER(fscache_object_is_active(obj) ||
 		       obj->n_ops != 0 ||
 		       obj->n_obj_ops != 0 ||
 		       obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
-		   fscache_object_states_short[obj->state],
+		   obj->state->short_name,
 		   obj->n_children,
 		   obj->n_ops,
 		   obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->flags,
 		   work_busy(&obj->work));
 
-	no_cookie = true;
-	keylen = auxlen = 0;
-	if (obj->cookie) {
-		spin_lock(&obj->lock);
-		if (obj->cookie) {
-			switch (obj->cookie->def->type) {
-			case 0:
-				type = "IX";
-				break;
-			case 1:
-				type = "DT";
-				break;
-			default:
-				sprintf(_type, "%02u",
-					obj->cookie->def->type);
-				type = _type;
-				break;
-			}
-
-			seq_printf(m, "%-16s %s %2lx %16p",
-				   obj->cookie->def->name,
-				   type,
-				   obj->cookie->flags,
-				   obj->cookie->netfs_data);
-
-			if (obj->cookie->def->get_key &&
-			    config & FSCACHE_OBJLIST_CONFIG_KEY)
-				keylen = obj->cookie->def->get_key(
-					obj->cookie->netfs_data,
-					buf, 400);
-
-			if (obj->cookie->def->get_aux &&
-			    config & FSCACHE_OBJLIST_CONFIG_AUX)
-				auxlen = obj->cookie->def->get_aux(
-					obj->cookie->netfs_data,
-					buf + keylen, 512 - keylen);
-
-			no_cookie = false;
-		}
-		spin_unlock(&obj->lock);
-
-		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+	if (fscache_use_cookie(obj)) {
+		uint16_t keylen = 0, auxlen = 0;
+
+		switch (cookie->def->type) {
+		case 0:
+			type = "IX";
+			break;
+		case 1:
+			type = "DT";
+			break;
+		default:
+			sprintf(_type, "%02u", cookie->def->type);
+			type = _type;
+			break;
+		}
+
+		seq_printf(m, "%-16s %s %2lx %16p",
+			   cookie->def->name,
+			   type,
+			   cookie->flags,
+			   cookie->netfs_data);
+
+		if (cookie->def->get_key &&
+		    config & FSCACHE_OBJLIST_CONFIG_KEY)
+			keylen = cookie->def->get_key(cookie->netfs_data,
+						      buf, 400);
+
+		if (cookie->def->get_aux &&
+		    config & FSCACHE_OBJLIST_CONFIG_AUX)
+			auxlen = cookie->def->get_aux(cookie->netfs_data,
+						      buf + keylen, 512 - keylen);
+		fscache_unuse_cookie(obj);
+
+		if (keylen > 0 || auxlen > 0) {
 			seq_printf(m, " ");
 			for (p = buf; keylen > 0; keylen--)
 				seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 				seq_printf(m, "%02x", *p++);
 			}
 		}
-	}
 
-	if (no_cookie)
-		seq_printf(m, "<no_cookie>\n");
-	else
-		seq_printf(m, "\n");
+		seq_printf(m, "\n");
+	} else {
+		seq_printf(m, "<no_netfs>\n");
+	}
 	return 0;
 }
 
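The reader now pins the cookie with fscache_use_cookie()/fscache_unuse_cookie() instead of the old double-checked obj->cookie test under the object lock. A minimal sketch of that guard pattern, assuming the use/unuse helpers added by this series succeed only while the cookie's n_active count is non-zero (the surrounding function is hypothetical):

	static void example_peek_cookie(struct fscache_object *object)
	{
		if (fscache_use_cookie(object)) {
			/* object->cookie cannot be detached while in use */
			printk(KERN_DEBUG "cookie %s\n",
			       object->cookie->def->name);
			fscache_unuse_cookie(object);
		}
	}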
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include "internal.h"
 
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
-	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
-	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
-	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",
-	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE",
-	[FSCACHE_OBJECT_INVALIDATING]	= "OBJECT_INVALIDATING",
-	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",
-	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",
-	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "OBJECT_ABORT_INIT",
-	[FSCACHE_OBJECT_RELEASING]	= "OBJECT_RELEASING",
-	[FSCACHE_OBJECT_RECYCLING]	= "OBJECT_RECYCLING",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "OBJECT_WITHDRAWING",
-	[FSCACHE_OBJECT_DEAD]		= "OBJECT_DEAD",
-};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
-	[FSCACHE_OBJECT_INIT]		= "INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
-	[FSCACHE_OBJECT_CREATING]	= "CRTN",
-	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
-	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
-	[FSCACHE_OBJECT_INVALIDATING]	= "INVL",
-	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
-	[FSCACHE_OBJECT_DYING]		= "DYNG",
-	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
-	[FSCACHE_OBJECT_RELEASING]	= "RELS",
-	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
-	[FSCACHE_OBJECT_DEAD]		= "DEAD",
-};
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state.  Work states are execution states.  No event processing
+ * is performed by them.  The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition.  Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+	const struct fscache_state __STATE_NAME(n) = {		\
+		.name = #n,					\
+		.short_name = sn,				\
+		.work = f					\
+	}
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state.  Wait states are event processing states.  No execution
+ * is performed by them.  Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y".  The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+	const struct fscache_state __STATE_NAME(n) = {		\
+		.name = #n,					\
+		.short_name = sn,				\
+		.work = NULL,					\
+		.transitions = { __VA_ARGS__, { 0, NULL } }	\
+	}
+
+#define TRANSIT_TO(state, emask) \
+	{ .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT,		"INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY,		"PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT,		"ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT,	"LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT,	"CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE,	"AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS,	"JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT,	"INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT,	"UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
+		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT,	"?PRN",
+		  TRANSIT_TO(PARENT_READY,	1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD,		"?CMD",
+		  TRANSIT_TO(INVALIDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_INVALIDATE),
+		  TRANSIT_TO(UPDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_UPDATE),
+		  TRANSIT_TO(JUMPSTART_DEPS,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE,	"?CLR",
+		  TRANSIT_TO(KILL_OBJECT,	1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables.  These are for handling unexpected
+ * events, such as an I/O error.  If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (a currently executing work state will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+	   TRANSIT_TO(ABORT_INIT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+	   TRANSIT_TO(LOOKUP_FAILURE,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+	   TRANSIT_TO(KILL_OBJECT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
 
 static int fscache_get_object(struct fscache_object *);
 static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
 static void fscache_dequeue_object(struct fscache_object *);
 
 /*
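
Taken together, the macros mean a state is just a named function and a transition is a tail-return. A sketch of a hypothetical work state under these macros (EXAMPLE_STATE and its handler are not part of the patch; real handlers such as fscache_initialise_object() follow below, and genuine errors are routed through the OOB tables rather than tested in-line like this):

	static const struct fscache_state *example_work(struct fscache_object *object,
							int event)
	{
		if (test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags))
			return transit_to(WAIT_FOR_CMD);  /* tail-transition */
		return NO_TRANSIT;  /* stay in this state; back to scheduler */
	}
	static WORK_STATE(EXAMPLE_STATE, "EXMP", example_work);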
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 	       object->debug_id, parent->debug_id, parent->n_ops);
 
 	spin_lock_nested(&parent->lock, 1);
-	parent->n_ops--;
 	parent->n_obj_ops--;
+	parent->n_ops--;
 	if (parent->n_ops == 0)
 		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
 	spin_unlock(&parent->lock);
 }
 
 /*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
  */
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
 {
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
-	enum fscache_object_state new_state;
-	struct fscache_cookie *cookie;
-	int event;
+	const struct fscache_transition *t;
+	const struct fscache_state *state, *new_state;
+	unsigned long events, event_mask;
+	int event = -1;
 
 	ASSERT(object != NULL);
 
 	_enter("{OBJ%x,%s,%lx}",
-	       object->debug_id, fscache_object_states[object->state],
-	       object->events);
-
-	switch (object->state) {
-		/* wait for the parent object to become ready */
-	case FSCACHE_OBJECT_INIT:
-		object->event_mask =
-			FSCACHE_OBJECT_EVENTS_MASK &
-			~(1 << FSCACHE_OBJECT_EV_CLEARED);
-		fscache_initialise_object(object);
-		goto done;
-
-		/* look up the object metadata on disk */
-	case FSCACHE_OBJECT_LOOKING_UP:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* create the object metadata on disk */
-	case FSCACHE_OBJECT_CREATING:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* handle an object becoming available; start pending
-		 * operations and queue dependent operations for processing */
-	case FSCACHE_OBJECT_AVAILABLE:
-		fscache_object_available(object);
-		goto active_transit;
-
-		/* normal running state */
-	case FSCACHE_OBJECT_ACTIVE:
-		goto active_transit;
-
-		/* Invalidate an object on disk */
-	case FSCACHE_OBJECT_INVALIDATING:
-		clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
-		fscache_stat(&fscache_n_invalidates_run);
-		fscache_stat(&fscache_n_cop_invalidate_object);
-		fscache_invalidate_object(object);
-		fscache_stat_d(&fscache_n_cop_invalidate_object);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
-		goto active_transit;
-
-		/* update the object metadata on disk */
-	case FSCACHE_OBJECT_UPDATING:
-		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
-		fscache_stat(&fscache_n_updates_run);
-		fscache_stat(&fscache_n_cop_update_object);
-		object->cache->ops->update_object(object);
-		fscache_stat_d(&fscache_n_cop_update_object);
-		goto active_transit;
-
-		/* handle an object dying during lookup or creation */
-	case FSCACHE_OBJECT_LC_DYING:
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-		fscache_stat(&fscache_n_cop_lookup_complete);
-		object->cache->ops->lookup_complete(object);
-		fscache_stat_d(&fscache_n_cop_lookup_complete);
-
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DYING;
-		cookie = object->cookie;
-		if (cookie) {
-			if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_LOOKING_UP);
-			if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_CREATING);
-		}
-		spin_unlock(&object->lock);
-
-		fscache_done_parent_op(object);
-
-		/* wait for completion of all active operations on this object
-		 * and the death of all child objects of this object */
-	case FSCACHE_OBJECT_DYING:
-	dying:
-		clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
-		spin_lock(&object->lock);
-		_debug("dying OBJ%x {%d,%d}",
-		       object->debug_id, object->n_ops, object->n_children);
-		if (object->n_ops == 0 && object->n_children == 0) {
-			object->event_mask &=
-				~(1 << FSCACHE_OBJECT_EV_CLEARED);
-			object->event_mask |=
-				(1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				(1 << FSCACHE_OBJECT_EV_RETIRE) |
-				(1 << FSCACHE_OBJECT_EV_RELEASE) |
-				(1 << FSCACHE_OBJECT_EV_ERROR);
-		} else {
-			object->event_mask &=
-				~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-				  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-				  (1 << FSCACHE_OBJECT_EV_ERROR));
-			object->event_mask |=
-				1 << FSCACHE_OBJECT_EV_CLEARED;
-		}
-		spin_unlock(&object->lock);
-		fscache_enqueue_dependents(object);
-		fscache_start_operations(object);
-		goto terminal_transit;
-
-		/* handle an abort during initialisation */
-	case FSCACHE_OBJECT_ABORT_INIT:
-		_debug("handle abort init %lx", object->events);
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
-		spin_lock(&object->lock);
-		fscache_dequeue_object(object);
-
-		object->state = FSCACHE_OBJECT_DYING;
-		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-				       &object->cookie->flags))
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_CREATING);
-		spin_unlock(&object->lock);
-		goto dying;
-
-		/* handle the netfs releasing an object and possibly marking it
-		 * obsolete too */
-	case FSCACHE_OBJECT_RELEASING:
-	case FSCACHE_OBJECT_RECYCLING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_release_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* handle the parent cache of this object being withdrawn from
-		 * active service */
-	case FSCACHE_OBJECT_WITHDRAWING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_withdraw_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* complain about the object being woken up once it is
-		 * deceased */
-	case FSCACHE_OBJECT_DEAD:
-		printk(KERN_ERR "FS-Cache:"
-		       " Unexpected event in dead state %lx\n",
-		       object->events & object->event_mask);
-		BUG();
-
-	default:
-		printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
-		       object->state);
-		BUG();
-	}
-
-	/* determine the transition from a lookup state */
-lookup_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_LC_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_REQUEUE:
-		goto done;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-	/* determine the transition from an active state */
-active_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_UPDATE:
-		new_state = FSCACHE_OBJECT_UPDATING;
-		goto change_state;
-	case -1:
-		new_state = FSCACHE_OBJECT_ACTIVE;
-		goto change_state; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-	/* determine the transition from a terminal state */
-terminal_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RETIRE:
-		new_state = FSCACHE_OBJECT_RECYCLING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RELEASE:
-		new_state = FSCACHE_OBJECT_RELEASING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_CLEARED:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-change_state:
-	spin_lock(&object->lock);
-	object->state = new_state;
-	spin_unlock(&object->lock);
-
-done:
-	_leave(" [->%s]", fscache_object_states[object->state]);
-	return;
-
-unsupported_event:
-	printk(KERN_ERR "FS-Cache:"
-	       " Unsupported event %d [%lx/%lx] in state %s\n",
-	       event, object->events, object->event_mask,
-	       fscache_object_states[object->state]);
-	BUG();
+	       object->debug_id, object->state->name, object->events);
+
+	event_mask = object->event_mask;
+restart:
+	object->event_mask = 0; /* Mask normal event handling */
+	state = object->state;
+restart_masked:
+	events = object->events;
+
+	/* Handle any out-of-band events (typically an error) */
+	if (events & object->oob_event_mask) {
+		_debug("{OBJ%x} oob %lx",
+		       object->debug_id, events & object->oob_event_mask);
+		for (t = object->oob_table; t->events; t++) {
+			if (events & t->events) {
+				state = t->transit_to;
+				ASSERT(state->work != NULL);
+				event = fls(events & t->events) - 1;
+				__clear_bit(event, &object->oob_event_mask);
+				clear_bit(event, &object->events);
+				goto execute_work_state;
+			}
+		}
+	}
+
+	/* Wait states are just transition tables */
+	if (!state->work) {
+		if (events & event_mask) {
+			for (t = state->transitions; t->events; t++) {
+				if (events & t->events) {
+					new_state = t->transit_to;
+					event = fls(events & t->events) - 1;
+					clear_bit(event, &object->events);
+					_debug("{OBJ%x} ev %d: %s -> %s",
+					       object->debug_id, event,
+					       state->name, new_state->name);
+					object->state = state = new_state;
+					goto execute_work_state;
+				}
+			}
+
+			/* The event mask didn't include all the tabled bits */
+			BUG();
+		}
+		/* Randomly woke up */
+		goto unmask_events;
+	}
+
+execute_work_state:
+	_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+
+	new_state = state->work(object, event);
+	event = -1;
+	if (new_state == NO_TRANSIT) {
+		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		fscache_enqueue_object(object);
+		event_mask = object->oob_event_mask;
+		goto unmask_events;
+	}
+
+	_debug("{OBJ%x} %s -> %s",
+	       object->debug_id, state->name, new_state->name);
+	object->state = state = new_state;
+
+	if (state->work) {
+		if (unlikely(state->work == ((void *)2UL))) {
+			_leave(" [dead]");
+			return;
+		}
+		goto restart_masked;
+	}
+
+	/* Transited to wait state */
+	event_mask = object->oob_event_mask;
+	for (t = state->transitions; t->events; t++)
+		event_mask |= t->events;
+
+unmask_events:
+	object->event_mask = event_mask;
+	smp_mb();
+	events = object->events;
+	if (events & event_mask)
+		goto restart;
+	_leave(" [msk %lx]", event_mask);
 }
 
 /*
  * execute an object
  */
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
 	_enter("{OBJ%x}", object->debug_id);
 
 	start = jiffies;
-	fscache_object_state_machine(object);
+	fscache_object_sm_dispatcher(object);
 	fscache_hist(fscache_objs_histogram, start);
-	if (object->events & object->event_mask)
-		fscache_enqueue_object(object);
-	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	fscache_put_object(object);
 }
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+			 struct fscache_cookie *cookie,
+			 struct fscache_cache *cache)
+{
+	const struct fscache_transition *t;
+
+	atomic_inc(&cache->object_count);
+
+	object->state = STATE(WAIT_FOR_INIT);
+	object->oob_table = fscache_osm_init_oob;
+	object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+	spin_lock_init(&object->lock);
+	INIT_LIST_HEAD(&object->cache_link);
+	INIT_HLIST_NODE(&object->cookie_link);
+	INIT_WORK(&object->work, fscache_object_work_func);
+	INIT_LIST_HEAD(&object->dependents);
+	INIT_LIST_HEAD(&object->dep_link);
+	INIT_LIST_HEAD(&object->pending_ops);
+	object->n_children = 0;
+	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+	object->events = 0;
+	object->store_limit = 0;
+	object->store_limit_l = 0;
+	object->cache = cache;
+	object->cookie = cookie;
+	object->parent = NULL;
+
+	object->oob_event_mask = 0;
+	for (t = object->oob_table; t->events; t++)
+		object->oob_event_mask |= t->events;
+	object->event_mask = object->oob_event_mask;
+	for (t = object->state->transitions; t->events; t++)
+		object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+	fscache_dequeue_object(object);
+	return transit_to(KILL_OBJECT);
+}
 
 /*
  * initialise an object
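A backend embeds struct fscache_object in its own object type and runs this initialiser from its ->alloc_object() op, leaving the machine parked in WAIT_FOR_INIT. A sketch under that assumption (example_object and the function are hypothetical; cachefiles_alloc_object() is the in-tree analogue):

	struct example_object {
		struct fscache_object	fscache;
		atomic_t		usage;
		/* ... backend-private state ... */
	};

	static struct fscache_object *example_alloc_object(struct fscache_cache *cache,
							   struct fscache_cookie *cookie)
	{
		struct example_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (!obj)
			return NULL;
		atomic_set(&obj->usage, 1);
		fscache_object_init(&obj->fscache, cookie, cache);
		return &obj->fscache;	/* parked in WAIT_FOR_INIT */
	}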
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
  *   immediately to do a creation
  * - we may need to start the process of creating a parent and we need to wait
  *   for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+							     int event)
 {
 	struct fscache_object *parent;
+	bool success;
 
-	_enter("");
-	ASSERT(object->cookie != NULL);
-	ASSERT(object->cookie->parent != NULL);
-
-	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
-			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			      (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			      (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
-		_debug("abort init %lx", object->events);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_ABORT_INIT;
-		spin_unlock(&object->lock);
-		return;
-	}
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->cookie->lock);
-	spin_lock_nested(&object->cookie->parent->lock, 1);
+	ASSERT(list_empty(&object->dep_link));
 
 	parent = object->parent;
 	if (!parent) {
-		_debug("no parent");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-	} else {
-		spin_lock(&object->lock);
-		spin_lock_nested(&parent->lock, 1);
-		_debug("parent %s", fscache_object_states[parent->state]);
-
-		if (parent->state >= FSCACHE_OBJECT_DYING) {
-			_debug("bad parent");
-			set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		} else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
-			_debug("wait");
-
-			/* we may get woken up in this state by child objects
-			 * binding on to us, so we need to make sure we don't
-			 * add ourself to the list multiple times */
-			if (list_empty(&object->dep_link)) {
-				fscache_stat(&fscache_n_cop_grab_object);
-				object->cache->ops->grab_object(object);
-				fscache_stat_d(&fscache_n_cop_grab_object);
-				list_add(&object->dep_link,
-					 &parent->dependents);
-
-				/* fscache_acquire_non_index_cookie() uses this
-				 * to wake the chain up */
-				if (parent->state == FSCACHE_OBJECT_INIT)
-					fscache_enqueue_object(parent);
-			}
-		} else {
-			_debug("go");
-			parent->n_ops++;
-			parent->n_obj_ops++;
-			object->lookup_jif = jiffies;
-			object->state = FSCACHE_OBJECT_LOOKING_UP;
-			set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		}
-
-		spin_unlock(&parent->lock);
-		spin_unlock(&object->lock);
-	}
-
-	spin_unlock(&object->cookie->parent->lock);
-	spin_unlock(&object->cookie->lock);
+		_leave(" [no parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	_debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+	if (fscache_object_is_dying(parent)) {
+		_leave(" [bad parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	if (fscache_object_is_available(parent)) {
+		_leave(" [ready]");
+		return transit_to(PARENT_READY);
+	}
+
+	_debug("wait");
+
+	spin_lock(&parent->lock);
+	fscache_stat(&fscache_n_cop_grab_object);
+	success = false;
+	if (fscache_object_is_live(parent) &&
+	    object->cache->ops->grab_object(object)) {
+		list_add(&object->dep_link, &parent->dependents);
+		success = true;
+	}
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	spin_unlock(&parent->lock);
+	if (!success) {
+		_leave(" [grab failed]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	/* fscache_acquire_non_index_cookie() uses this
+	 * to wake the chain up */
+	fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+	_leave(" [wait]");
+	return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+							int event)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(parent != NULL);
+
+	spin_lock(&parent->lock);
+	parent->n_ops++;
+	parent->n_obj_ops++;
+	object->lookup_jif = jiffies;
+	spin_unlock(&parent->lock);
 
 	_leave("");
+	return transit_to(LOOK_UP_OBJECT);
 }
 
 /*
  * look an object up in the cache from which it was allocated
  * - we hold an "access lock" on the parent object, so the parent object cannot
  *   be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+							   int event)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	struct fscache_object *parent;
+	struct fscache_object *parent = object->parent;
 	int ret;
 
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_lookup_oob;
 
-	parent = object->parent;
 	ASSERT(parent != NULL);
 	ASSERTCMP(parent->n_ops, >, 0);
 	ASSERTCMP(parent->n_obj_ops, >, 0);
 
 	/* make sure the parent is still available */
-	ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERT(fscache_object_is_available(parent));
 
-	if (parent->state >= FSCACHE_OBJECT_DYING ||
-	    test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
-		_debug("unavailable");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		_leave("");
-		return;
+	if (fscache_object_is_dying(parent) ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+	    !fscache_use_cookie(object)) {
+		_leave(" [unavailable]");
+		return transit_to(LOOKUP_FAILURE);
 	}
 
-	_debug("LOOKUP \"%s/%s\" in \"%s\"",
-	       parent->cookie->def->name, cookie->def->name,
-	       object->cache->tag->name);
+	_debug("LOOKUP \"%s\" in \"%s\"",
+	       cookie->def->name, object->cache->tag->name);
 
 	fscache_stat(&fscache_n_object_lookups);
 	fscache_stat(&fscache_n_cop_lookup_object);
 	ret = object->cache->ops->lookup_object(object);
 	fscache_stat_d(&fscache_n_cop_lookup_object);
 
-	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
-		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	fscache_unuse_cookie(object);
 
 	if (ret == -ETIMEDOUT) {
 		/* probably stuck behind another object, so move this one to
 		 * the back of the queue */
 		fscache_stat(&fscache_n_object_lookups_timed_out);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		_leave(" [timeout]");
+		return NO_TRANSIT;
 	}
 
-	_leave("");
+	if (ret < 0) {
+		_leave(" [error]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_leave(" [ok]");
+	return transit_to(OBJECT_AVAILABLE);
 }
 
 /**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_negative);
 
-		/* transit here to allow write requests to begin stacking up
-		 * and read requests to begin returning ENODATA */
-		object->state = FSCACHE_OBJECT_CREATING;
-		spin_unlock(&object->lock);
-
-		set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+		/* Allow write requests to begin stacking up and read requests to begin
+		 * returning ENODATA.
+		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
-		spin_unlock(&object->lock);
 	}
-
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_object_lookup_negative);
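
The clear_bit_unlock()/wake_up_bit() pair above releases waiters that went to sleep on the LOOKING_UP bit with the fscache_wait_bit* sleep functions from main.c. A sketch of the waiter side, assuming the wait_on_bit() calling convention of this kernel (compare fscache_wait_for_deferred_lookup() in page.c):

	static int example_wait_for_lookup(struct fscache_cookie *cookie)
	{
		/* Returns non-zero if interrupted by a signal */
		return wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
				   fscache_wait_bit_interruptible,
				   TASK_INTERRUPTIBLE);
	}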
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
 	/* if we were still looking up, then we must have a positive lookup
 	 * result, in which case there may be data available */
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_positive);
 
-		clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		/* We do (presumably) have data */
+		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		/* Allow write requests to begin stacking up and read requests
+		 * to begin shovelling data.
+		 */
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
 		fscache_stat(&fscache_n_object_created);
-
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		smp_wmb();
 	}
 
-	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+	set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
 /*
  * handle an object that has just become available
  */
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+							    int event)
 {
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->lock);
+	object->oob_table = fscache_osm_run_oob;
 
-	if (object->cookie &&
-	    test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
-		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+	spin_lock(&object->lock);
 
 	fscache_done_parent_op(object);
 	if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
 	fscache_stat(&fscache_n_cop_lookup_complete);
 	object->cache->ops->lookup_complete(object);
 	fscache_stat_d(&fscache_n_cop_lookup_complete);
-	fscache_enqueue_dependents(object);
 
 	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
 	fscache_stat(&fscache_n_object_avail);
 
 	_leave("");
+	return transit_to(JUMPSTART_DEPS);
 }
 
 /*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
  */
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+								int event)
 {
-	struct fscache_object *parent = object->parent;
-	struct fscache_cache *cache = object->cache;
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+		return NO_TRANSIT; /* Not finished; requeue */
+	return transit_to(WAIT_FOR_CMD);
+}
 
-	ASSERTCMP(object->cookie, ==, NULL);
-	ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failure.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie;
 
-	spin_lock(&cache->object_list_lock);
-	list_del_init(&object->cache_link);
-	spin_unlock(&cache->object_list_lock);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_stat(&fscache_n_cop_drop_object);
-	cache->ops->drop_object(object);
-	fscache_stat_d(&fscache_n_cop_drop_object);
+	object->oob_event_mask = 0;
 
-	if (parent) {
-		_debug("release parent OBJ%x {%d}",
-		       parent->debug_id, parent->n_children);
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
 
-		spin_lock(&parent->lock);
-		parent->n_children--;
-		if (parent->n_children == 0)
-			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
-		spin_unlock(&parent->lock);
-		object->parent = NULL;
+	cookie = object->cookie;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+	fscache_done_parent_op(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+						       int event)
+{
+	_enter("{OBJ%x,%d,%d},%d",
+	       object->debug_id, object->n_ops, object->n_children, event);
+
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	object->oob_event_mask = 0;
+
+	if (list_empty(&object->dependents) &&
+	    object->n_ops == 0 &&
+	    object->n_children == 0)
+		return transit_to(DROP_OBJECT);
+
+	if (object->n_in_progress == 0) {
+		spin_lock(&object->lock);
+		if (object->n_ops > 0 && object->n_in_progress == 0)
+			fscache_start_operations(object);
+		spin_unlock(&object->lock);
 	}
 
-	/* this just shifts the object release to the work processor */
-	fscache_put_object(object);
+	if (!list_empty(&object->dependents))
+		return transit_to(KILL_DEPENDENTS);
 
-	_leave("");
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
  */
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+							   int event)
 {
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_drop_object(object);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+		return NO_TRANSIT; /* Not finished */
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * withdraw an object from active service
+ * Drop an object's attachments
  */
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+						       int event)
 {
-	struct fscache_cookie *cookie;
-	bool detached;
+	struct fscache_object *parent = object->parent;
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cache *cache = object->cache;
+	bool awaken = false;
 
-	_enter("");
+	_enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
 
-	spin_lock(&object->lock);
-	cookie = object->cookie;
-	if (cookie) {
-		/* need to get the cookie lock before the object lock, starting
-		 * from the object pointer */
-		atomic_inc(&cookie->usage);
-		spin_unlock(&object->lock);
+	ASSERT(cookie != NULL);
+	ASSERT(!hlist_unhashed(&object->cookie_link));
 
-		detached = false;
-		spin_lock(&cookie->lock);
-		spin_lock(&object->lock);
+	/* Make sure the cookie no longer points here and that the netfs isn't
+	 * waiting for us.
+	 */
+	spin_lock(&cookie->lock);
+	hlist_del_init(&object->cookie_link);
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		awaken = true;
+	spin_unlock(&cookie->lock);
 
-		if (object->cookie == cookie) {
-			hlist_del_init(&object->cookie_link);
-			object->cookie = NULL;
-			fscache_invalidation_complete(cookie);
-			detached = true;
-		}
-		spin_unlock(&cookie->lock);
-		fscache_cookie_put(cookie);
-		if (detached)
-			fscache_cookie_put(cookie);
-	}
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
+	/* Prevent a race with our last child, which has to signal EV_CLEARED
+	 * before dropping our spinlock.
+	 */
+	spin_lock(&object->lock);
 	spin_unlock(&object->lock);
 
-	fscache_drop_object(object);
-}
+	/* Discard from the cache's collection of objects */
+	spin_lock(&cache->object_list_lock);
+	list_del_init(&object->cache_link);
+	spin_unlock(&cache->object_list_lock);
 
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- *   (1) recycler decides to reclaim an in-use object
- *   (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- *   simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
-				struct fscache_object *object)
-{
-	bool enqueue = false;
+	fscache_stat(&fscache_n_cop_drop_object);
+	cache->ops->drop_object(object);
+	fscache_stat_d(&fscache_n_cop_drop_object);
 
-	_enter(",OBJ%x", object->debug_id);
+	/* The parent object wants to know when all its dependents have gone */
+	if (parent) {
+		_debug("release parent OBJ%x {%d}",
+		       parent->debug_id, parent->n_children);
 
-	spin_lock(&object->lock);
-	if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
-		object->state = FSCACHE_OBJECT_WITHDRAWING;
-		enqueue = true;
+		spin_lock(&parent->lock);
+		parent->n_children--;
+		if (parent->n_children == 0)
+			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+		spin_unlock(&parent->lock);
+		object->parent = NULL;
 	}
-	spin_unlock(&object->lock);
 
-	if (enqueue)
-		fscache_enqueue_object(object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
+	fscache_stat(&fscache_n_object_dead);
 
 	_leave("");
+	return transit_to(OBJECT_DEAD);
 }
 
 /*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
 }
 
 /*
- * discard a ref on a work item
+ * Discard a ref on an object
  */
 static void fscache_put_object(struct fscache_object *object)
 {
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
 	fscache_stat_d(&fscache_n_cop_put_object);
 }
 
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+	fscache_objlist_remove(object);
+
+	/* We can get rid of the cookie now */
+	fscache_cookie_put(object->cookie);
+	object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
 /*
  * enqueue an object for metadata-type processing
  */
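
With the objlist removal folded into it, fscache_object_destroy() becomes the backend's single teardown notification: it unhooks the object from the /proc object list and drops the cookie reference. A sketch of the expected call site, reusing the hypothetical example_object wrapper from the earlier sketch (cachefiles_put_object() is the in-tree analogue):

	static void example_put_object(struct fscache_object *_object)
	{
		struct example_object *obj =
			container_of(_object, struct example_object, fscache);

		if (atomic_dec_and_test(&obj->usage)) {
			fscache_object_destroy(&obj->fscache);
			kfree(obj);
		}
	}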
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
 
 /**
  * fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
  *
  * Allow an object handler to sleep until the object workqueue is congested.
  *
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
 EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately.  We return true if the list was
+ * cleared.
  */
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
 {
 	struct fscache_object *dep;
+	bool ret = true;
 
 	_enter("{OBJ%x}", object->debug_id);
 
 	if (list_empty(&object->dependents))
-		return;
+		return true;
 
 	spin_lock(&object->lock);
 
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 			       struct fscache_object, dep_link);
 		list_del_init(&dep->dep_link);
 
-
-		/* sort onto appropriate lists */
-		fscache_enqueue_object(dep);
+		fscache_raise_event(dep, event);
 		fscache_put_object(dep);
 
-		if (!list_empty(&object->dependents))
-			cond_resched_lock(&object->lock);
+		if (!list_empty(&object->dependents) && need_resched()) {
+			ret = false;
+			break;
+		}
 	}
 
 	spin_unlock(&object->lock);
+	return ret;
 }
 
 /*
  * remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
  */
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
  * @data: The auxiliary data for the object
  * @datalen: The size of the auxiliary data
  *
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
  */
 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
927/* 914/*
928 * Asynchronously invalidate an object. 915 * Asynchronously invalidate an object.
929 */ 916 */
930static void fscache_invalidate_object(struct fscache_object *object) 917static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
918 int event)
931{ 919{
932 struct fscache_operation *op; 920 struct fscache_operation *op;
933 struct fscache_cookie *cookie = object->cookie; 921 struct fscache_cookie *cookie = object->cookie;
934 922
935 _enter("{OBJ%x}", object->debug_id); 923 _enter("{OBJ%x},%d", object->debug_id, event);
924
925 /* We're going to need the cookie. If the cookie is not available then
926 * retire the object instead.
927 */
928 if (!fscache_use_cookie(object)) {
929 ASSERT(object->cookie->stores.rnode == NULL);
930 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
931 _leave(" [no cookie]");
932 return transit_to(KILL_OBJECT);
933 }
936 934
937 /* Reject any new read/write ops and abort any that are pending. */ 935 /* Reject any new read/write ops and abort any that are pending. */
938 fscache_invalidate_writes(cookie); 936 fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
941 939
942 /* Now we have to wait for in-progress reads and writes */ 940 /* Now we have to wait for in-progress reads and writes */
943 op = kzalloc(sizeof(*op), GFP_KERNEL); 941 op = kzalloc(sizeof(*op), GFP_KERNEL);
944 if (!op) { 942 if (!op)
945 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); 943 goto nomem;
946 _leave(" [ENOMEM]");
947 return;
948 }
949 944
950 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); 945 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
951 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 946 op->flags = FSCACHE_OP_ASYNC |
947 (1 << FSCACHE_OP_EXCLUSIVE) |
948 (1 << FSCACHE_OP_UNUSE_COOKIE);
952 949
953 spin_lock(&cookie->lock); 950 spin_lock(&cookie->lock);
954 if (fscache_submit_exclusive_op(object, op) < 0) 951 if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
965 /* We can allow read and write requests to come in once again. They'll 962 /* We can allow read and write requests to come in once again. They'll
966 * queue up behind our exclusive invalidation operation. 963 * queue up behind our exclusive invalidation operation.
967 */ 964 */
968 fscache_invalidation_complete(cookie); 965 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
969 _leave(""); 966 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
970 return; 967 _leave(" [ok]");
968 return transit_to(UPDATE_OBJECT);
969
970nomem:
971 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
972 fscache_unuse_cookie(object);
973 _leave(" [ENOMEM]");
974 return transit_to(KILL_OBJECT);
971 975
972submit_op_failed: 976submit_op_failed:
977 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
973 spin_unlock(&cookie->lock); 978 spin_unlock(&cookie->lock);
974 kfree(op); 979 kfree(op);
975 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
976 _leave(" [EIO]"); 980 _leave(" [EIO]");
981 return transit_to(KILL_OBJECT);
982}
983
984static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
985 int event)
986{
987 const struct fscache_state *s;
988
989 fscache_stat(&fscache_n_invalidates_run);
990 fscache_stat(&fscache_n_cop_invalidate_object);
991 s = _fscache_invalidate_object(object, event);
992 fscache_stat_d(&fscache_n_cop_invalidate_object);
993 return s;
994}
995
996/*
997 * Asynchronously update an object.
998 */
999static const struct fscache_state *fscache_update_object(struct fscache_object *object,
1000 int event)
1001{
1002 _enter("{OBJ%x},%d", object->debug_id, event);
1003
1004 fscache_stat(&fscache_n_updates_run);
1005 fscache_stat(&fscache_n_cop_update_object);
1006 object->cache->ops->update_object(object);
1007 fscache_stat_d(&fscache_n_cop_update_object);
1008
1009 _leave("");
1010 return transit_to(WAIT_FOR_CMD);
977} 1011}
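All of the reworked handlers share one shape: take an object and the triggering event, do the work, and name the next state via transit_to(). A hypothetical handler in that style (a sketch only; KILL_OBJECT and WAIT_FOR_CMD are the states already used by the hunks above):

	static const struct fscache_state *example_handler(struct fscache_object *object,
							   int event)
	{
		_enter("{OBJ%x},%d", object->debug_id, event);

		if (!fscache_use_cookie(object))
			return transit_to(KILL_OBJECT);	/* cookie already gone */

		/* ... the state's actual work would go here ... */

		fscache_unuse_cookie(object);
		return transit_to(WAIT_FOR_CMD);
	}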
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
35 35
36 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
37 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERT(fscache_object_is_available(op->object));
39 ASSERTCMP(atomic_read(&op->usage), >, 0); 39 ASSERTCMP(atomic_read(&op->usage), >, 0);
40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); 40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
41 41
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
119 /* need to issue a new write op after this */ 119 /* need to issue a new write op after this */
120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
121 ret = 0; 121 ret = 0;
122 } else if (object->state == FSCACHE_OBJECT_CREATING) { 122 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
123 op->object = object; 123 op->object = object;
124 object->n_ops++; 124 object->n_ops++;
125 object->n_exclusive++; /* reads and writes must wait */ 125 object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
144 */ 144 */
145static void fscache_report_unexpected_submission(struct fscache_object *object, 145static void fscache_report_unexpected_submission(struct fscache_object *object,
146 struct fscache_operation *op, 146 struct fscache_operation *op,
147 unsigned long ostate) 147 const struct fscache_state *ostate)
148{ 148{
149 static bool once_only; 149 static bool once_only;
150 struct fscache_operation *p; 150 struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
155 once_only = true; 155 once_only = true;
156 156
157 kdebug("unexpected submission OP%x [OBJ%x %s]", 157 kdebug("unexpected submission OP%x [OBJ%x %s]",
158 op->debug_id, object->debug_id, 158 op->debug_id, object->debug_id, object->state->name);
159 fscache_object_states[object->state]); 159 kdebug("objstate=%s [%s]", object->state->name, ostate->name);
160 kdebug("objstate=%s [%s]",
161 fscache_object_states[object->state],
162 fscache_object_states[ostate]);
163 kdebug("objflags=%lx", object->flags); 160 kdebug("objflags=%lx", object->flags);
164 kdebug("objevent=%lx [%lx]", object->events, object->event_mask); 161 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
165 kdebug("ops=%u inp=%u exc=%u", 162 kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
190int fscache_submit_op(struct fscache_object *object, 187int fscache_submit_op(struct fscache_object *object,
191 struct fscache_operation *op) 188 struct fscache_operation *op)
192{ 189{
193 unsigned long ostate; 190 const struct fscache_state *ostate;
194 int ret; 191 int ret;
195 192
196 _enter("{OBJ%x OP%x},{%u}", 193 _enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
226 fscache_run_op(object, op); 223 fscache_run_op(object, op);
227 } 224 }
228 ret = 0; 225 ret = 0;
229 } else if (object->state == FSCACHE_OBJECT_CREATING) { 226 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
230 op->object = object; 227 op->object = object;
231 object->n_ops++; 228 object->n_ops++;
232 atomic_inc(&op->usage); 229 atomic_inc(&op->usage);
233 list_add_tail(&op->pend_link, &object->pending_ops); 230 list_add_tail(&op->pend_link, &object->pending_ops);
234 fscache_stat(&fscache_n_op_pend); 231 fscache_stat(&fscache_n_op_pend);
235 ret = 0; 232 ret = 0;
236 } else if (object->state == FSCACHE_OBJECT_DYING || 233 } else if (fscache_object_is_dying(object)) {
237 object->state == FSCACHE_OBJECT_LC_DYING ||
238 object->state == FSCACHE_OBJECT_WITHDRAWING) {
239 fscache_stat(&fscache_n_op_rejected); 234 fscache_stat(&fscache_n_op_rejected);
240 op->state = FSCACHE_OP_ST_CANCELLED; 235 op->state = FSCACHE_OP_ST_CANCELLED;
241 ret = -ENOBUFS; 236 ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
265} 260}
266 261
267/* 262/*
268 * jump start the operation processing on an object 263 * Jump start the operation processing on an object. The caller must hold
269 * - caller must hold object->lock 264 * object->lock.
270 */ 265 */
271void fscache_start_operations(struct fscache_object *object) 266void fscache_start_operations(struct fscache_object *object)
272{ 267{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
428 423
429 object = op->object; 424 object = op->object;
430 425
431 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { 426 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
432 if (atomic_dec_and_test(&object->n_reads)) { 427 atomic_dec(&object->n_reads);
433 clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, 428 if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
434 &object->cookie->flags); 429 fscache_unuse_cookie(object);
435 wake_up_bit(&object->cookie->flags,
436 FSCACHE_COOKIE_WAITING_ON_READS);
437 }
438 }
439 430
440 /* now... we may get called with the object spinlock held, so we 431 /* now... we may get called with the object spinlock held, so we
441 * complete the cleanup here only if we can immediately acquire the 432 * complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
109 * allocator as the work threads writing to the cache may all end up 109 * allocator as the work threads writing to the cache may all end up
110 * sleeping on memory allocation, so we may need to impose a timeout 110 * sleeping on memory allocation, so we may need to impose a timeout
111 * too. */ 111 * too. */
112 if (!(gfp & __GFP_WAIT)) { 112 if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
113 fscache_stat(&fscache_n_store_vmscan_busy); 113 fscache_stat(&fscache_n_store_vmscan_busy);
114 return false; 114 return false;
115 } 115 }
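The widened test only lets reclaim wait on the cache when the allocation context may both sleep and re-enter filesystem code; either restriction alone risks deadlocking against the work threads writing to the cache. The same condition as a standalone predicate (sketch; __GFP_WAIT was the "may sleep" flag of this era):

	static inline bool may_wait_for_store(gfp_t gfp)
	{
		return (gfp & __GFP_WAIT) && (gfp & __GFP_FS);
	}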
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
163 163
164 fscache_stat(&fscache_n_attr_changed_calls); 164 fscache_stat(&fscache_n_attr_changed_calls);
165 165
166 if (fscache_object_is_active(object)) { 166 if (fscache_object_is_active(object) &&
167 fscache_use_cookie(object)) {
167 fscache_stat(&fscache_n_cop_attr_changed); 168 fscache_stat(&fscache_n_cop_attr_changed);
168 ret = object->cache->ops->attr_changed(object); 169 ret = object->cache->ops->attr_changed(object);
169 fscache_stat_d(&fscache_n_cop_attr_changed); 170 fscache_stat_d(&fscache_n_cop_attr_changed);
171 fscache_unuse_cookie(object);
170 if (ret < 0) 172 if (ret < 0)
171 fscache_abort_object(object); 173 fscache_abort_object(object);
172 } 174 }
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
233 235
234 _enter("{OP%x}", op->op.debug_id); 236 _enter("{OP%x}", op->op.debug_id);
235 237
236 ASSERTCMP(op->n_pages, ==, 0); 238 ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
237 239
238 fscache_hist(fscache_retrieval_histogram, op->start_time); 240 fscache_hist(fscache_retrieval_histogram, op->start_time);
239 if (op->context) 241 if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
246 * allocate a retrieval op 248 * allocate a retrieval op
247 */ 249 */
248static struct fscache_retrieval *fscache_alloc_retrieval( 250static struct fscache_retrieval *fscache_alloc_retrieval(
251 struct fscache_cookie *cookie,
249 struct address_space *mapping, 252 struct address_space *mapping,
250 fscache_rw_complete_t end_io_func, 253 fscache_rw_complete_t end_io_func,
251 void *context) 254 void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
260 } 263 }
261 264
262 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); 265 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
263 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); 266 atomic_inc(&cookie->n_active);
267 op->op.flags = FSCACHE_OP_MYTHREAD |
268 (1UL << FSCACHE_OP_WAITING) |
269 (1UL << FSCACHE_OP_UNUSE_COOKIE);
264 op->mapping = mapping; 270 op->mapping = mapping;
265 op->end_io_func = end_io_func; 271 op->end_io_func = end_io_func;
266 op->context = context; 272 op->context = context;
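The n_active count taken here is what FSCACHE_OP_UNUSE_COOKIE refers to in the fscache_put_operation() hunk earlier in this diff: a submitted op drops the count automatically on put, while an op that never gets submitted must undo it by hand, as the nobufs paths below do. A sketch of that contract (simplified, no submission shown):

	static int example_alloc_and_abandon(struct fscache_cookie *cookie,
					     struct address_space *mapping)
	{
		struct fscache_retrieval *op;

		op = fscache_alloc_retrieval(cookie, mapping, NULL, NULL);
		if (!op)
			return -ENOMEM;		/* n_active was not touched */
		atomic_set(&op->n_pages, 1);

		/* Never submitted: drop the count manually, exactly as the
		 * nobufs_unlock error paths do. */
		atomic_dec(&cookie->n_active);
		kfree(op);
		return 0;
	}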
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
310 struct fscache_retrieval *op = 316 struct fscache_retrieval *op =
311 container_of(_op, struct fscache_retrieval, op); 317 container_of(_op, struct fscache_retrieval, op);
312 318
313 op->n_pages = 0; 319 atomic_set(&op->n_pages, 0);
314} 320}
315 321
316/* 322/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
394 if (fscache_wait_for_deferred_lookup(cookie) < 0) 400 if (fscache_wait_for_deferred_lookup(cookie) < 0)
395 return -ERESTARTSYS; 401 return -ERESTARTSYS;
396 402
397 op = fscache_alloc_retrieval(page->mapping, end_io_func, context); 403 op = fscache_alloc_retrieval(cookie, page->mapping,
 404 end_io_func, context);
398 if (!op) { 405 if (!op) {
399 _leave(" = -ENOMEM"); 406 _leave(" = -ENOMEM");
400 return -ENOMEM; 407 return -ENOMEM;
401 } 408 }
402 op->n_pages = 1; 409 atomic_set(&op->n_pages, 1);
403 410
404 spin_lock(&cookie->lock); 411 spin_lock(&cookie->lock);
405 412
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
408 object = hlist_entry(cookie->backing_objects.first, 415 object = hlist_entry(cookie->backing_objects.first,
409 struct fscache_object, cookie_link); 416 struct fscache_object, cookie_link);
410 417
411 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 418 ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
412 419
413 atomic_inc(&object->n_reads); 420 atomic_inc(&object->n_reads);
414 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 421 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
465 atomic_dec(&object->n_reads); 472 atomic_dec(&object->n_reads);
466nobufs_unlock: 473nobufs_unlock:
467 spin_unlock(&cookie->lock); 474 spin_unlock(&cookie->lock);
475 atomic_dec(&cookie->n_active);
468 kfree(op); 476 kfree(op);
469nobufs: 477nobufs:
470 fscache_stat(&fscache_n_retrievals_nobufs); 478 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
522 if (fscache_wait_for_deferred_lookup(cookie) < 0) 530 if (fscache_wait_for_deferred_lookup(cookie) < 0)
523 return -ERESTARTSYS; 531 return -ERESTARTSYS;
524 532
525 op = fscache_alloc_retrieval(mapping, end_io_func, context); 533 op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
526 if (!op) 534 if (!op)
527 return -ENOMEM; 535 return -ENOMEM;
528 op->n_pages = *nr_pages; 536 atomic_set(&op->n_pages, *nr_pages);
529 537
530 spin_lock(&cookie->lock); 538 spin_lock(&cookie->lock);
531 539
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
589 atomic_dec(&object->n_reads); 597 atomic_dec(&object->n_reads);
590nobufs_unlock: 598nobufs_unlock:
591 spin_unlock(&cookie->lock); 599 spin_unlock(&cookie->lock);
600 atomic_dec(&cookie->n_active);
592 kfree(op); 601 kfree(op);
593nobufs: 602nobufs:
594 fscache_stat(&fscache_n_retrievals_nobufs); 603 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
631 if (fscache_wait_for_deferred_lookup(cookie) < 0) 640 if (fscache_wait_for_deferred_lookup(cookie) < 0)
632 return -ERESTARTSYS; 641 return -ERESTARTSYS;
633 642
634 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 643 op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
635 if (!op) 644 if (!op)
636 return -ENOMEM; 645 return -ENOMEM;
637 op->n_pages = 1; 646 atomic_set(&op->n_pages, 1);
638 647
639 spin_lock(&cookie->lock); 648 spin_lock(&cookie->lock);
640 649
@@ -675,6 +684,7 @@ error:
675 684
676nobufs_unlock: 685nobufs_unlock:
677 spin_unlock(&cookie->lock); 686 spin_unlock(&cookie->lock);
687 atomic_dec(&cookie->n_active);
678 kfree(op); 688 kfree(op);
679nobufs: 689nobufs:
680 fscache_stat(&fscache_n_allocs_nobufs); 690 fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
729 */ 739 */
730 spin_unlock(&object->lock); 740 spin_unlock(&object->lock);
731 fscache_op_complete(&op->op, false); 741 fscache_op_complete(&op->op, false);
732 _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", 742 _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
733 _op->flags, _op->state, object->state, object->flags); 743 _op->flags, _op->state, object->state->short_name,
744 object->flags);
734 return; 745 return;
735 } 746 }
736 747
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
796 807
797 _enter(""); 808 _enter("");
798 809
799 while (spin_lock(&cookie->stores_lock), 810 for (;;) {
800 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 811 spin_lock(&cookie->stores_lock);
801 ARRAY_SIZE(results), 812 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
802 FSCACHE_COOKIE_PENDING_TAG), 813 ARRAY_SIZE(results),
803 n > 0) { 814 FSCACHE_COOKIE_PENDING_TAG);
815 if (n == 0) {
816 spin_unlock(&cookie->stores_lock);
817 break;
818 }
819
804 for (i = n - 1; i >= 0; i--) { 820 for (i = n - 1; i >= 0; i--) {
805 page = results[i]; 821 page = results[i];
806 radix_tree_delete(&cookie->stores, page->index); 822 radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
812 page_cache_release(results[i]); 828 page_cache_release(results[i]);
813 } 829 }
814 830
815 spin_unlock(&cookie->stores_lock);
816 _leave(""); 831 _leave("");
817} 832}
818 833
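The explicit for (;;) form makes the locking visible: the lock is taken at the top of every pass and is always released before the loop is left. The same drain-everything-with-this-tag pattern as a self-contained helper (hypothetical, simplified; it additionally counts what it released):

	static unsigned int example_drain_pending(struct fscache_cookie *cookie)
	{
		void *results[16];
		unsigned int total = 0;
		struct page *page;
		int n, i;

		for (;;) {
			spin_lock(&cookie->stores_lock);
			n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
						       ARRAY_SIZE(results),
						       FSCACHE_COOKIE_PENDING_TAG);
			if (n == 0) {
				spin_unlock(&cookie->stores_lock);
				return total;
			}
			for (i = n - 1; i >= 0; i--) {
				page = results[i];
				radix_tree_delete(&cookie->stores, page->index);
			}
			spin_unlock(&cookie->stores_lock);

			/* page refs must be dropped outside the lock */
			for (i = 0; i < n; i++)
				page_cache_release(results[i]);
			total += n;
		}
	}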
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
829 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is 844 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
830 * set) 845 * set)
831 * 846 *
832 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred 847 * (a) no writes yet
833 * fill op)
834 * 848 *
835 * (b) writes deferred till post-creation (mark page for writing and 849 * (b) writes deferred till post-creation (mark page for writing and
836 * return immediately) 850 * return immediately)
837 * 851 *
838 * (2) negative lookup, object created, initial fill being made from netfs 852 * (2) negative lookup, object created, initial fill being made from netfs
839 * (FSCACHE_COOKIE_INITIAL_FILL is set)
840 * 853 *
841 * (a) fill point not yet reached this page (mark page for writing and 854 * (a) fill point not yet reached this page (mark page for writing and
842 * return) 855 * return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
873 886
874 fscache_operation_init(&op->op, fscache_write_op, 887 fscache_operation_init(&op->op, fscache_write_op,
875 fscache_release_write_op); 888 fscache_release_write_op);
876 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 889 op->op.flags = FSCACHE_OP_ASYNC |
890 (1 << FSCACHE_OP_WAITING) |
891 (1 << FSCACHE_OP_UNUSE_COOKIE);
877 892
878 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 893 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
879 if (ret < 0) 894 if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
919 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); 934 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
920 op->store_limit = object->store_limit; 935 op->store_limit = object->store_limit;
921 936
937 atomic_inc(&cookie->n_active);
922 if (fscache_submit_op(object, &op->op) < 0) 938 if (fscache_submit_op(object, &op->op) < 0)
923 goto submit_failed; 939 goto submit_failed;
924 940
@@ -945,6 +961,7 @@ already_pending:
945 return 0; 961 return 0;
946 962
947submit_failed: 963submit_failed:
964 atomic_dec(&cookie->n_active);
948 spin_lock(&cookie->stores_lock); 965 spin_lock(&cookie->stores_lock);
949 radix_tree_delete(&cookie->stores, page->index); 966 radix_tree_delete(&cookie->stores, page->index);
950 spin_unlock(&cookie->stores_lock); 967 spin_unlock(&cookie->stores_lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f3f783dc4f75..72a5d5b04494 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,7 +14,7 @@
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) 17static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
18{ 18{
19 struct fuse_conn *fc = get_fuse_conn(dir); 19 struct fuse_conn *fc = get_fuse_conn(dir);
20 struct fuse_inode *fi = get_fuse_inode(dir); 20 struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
25 return true; 25 return true;
26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) 26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
27 return true; 27 return true;
28 if (filp->f_pos == 0) 28 if (ctx->pos == 0)
29 return true; 29 return true;
30 return false; 30 return false;
31} 31}
@@ -1165,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask)
1165} 1165}
1166 1166
1167static int parse_dirfile(char *buf, size_t nbytes, struct file *file, 1167static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
1168 void *dstbuf, filldir_t filldir) 1168 struct dir_context *ctx)
1169{ 1169{
1170 while (nbytes >= FUSE_NAME_OFFSET) { 1170 while (nbytes >= FUSE_NAME_OFFSET) {
1171 struct fuse_dirent *dirent = (struct fuse_dirent *) buf; 1171 struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
1172 size_t reclen = FUSE_DIRENT_SIZE(dirent); 1172 size_t reclen = FUSE_DIRENT_SIZE(dirent);
1173 int over;
1174 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) 1173 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
1175 return -EIO; 1174 return -EIO;
1176 if (reclen > nbytes) 1175 if (reclen > nbytes)
1177 break; 1176 break;
1178 1177
1179 over = filldir(dstbuf, dirent->name, dirent->namelen, 1178 if (!dir_emit(ctx, dirent->name, dirent->namelen,
1180 file->f_pos, dirent->ino, dirent->type); 1179 dirent->ino, dirent->type))
1181 if (over)
1182 break; 1180 break;
1183 1181
1184 buf += reclen; 1182 buf += reclen;
1185 nbytes -= reclen; 1183 nbytes -= reclen;
1186 file->f_pos = dirent->off; 1184 ctx->pos = dirent->off;
1187 } 1185 }
1188 1186
1189 return 0; 1187 return 0;
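parse_dirfile() is now written against the dir_context/dir_emit API that replaces filldir in this cycle: the actor returns false to stop, and the iterator itself advances ctx->pos. The smallest possible iterate() method has this shape (illustrative sketch, not fuse code; the entry name and inode number are invented):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		/* Emit "." and "..", then a single fixed entry at pos 2. */
		if (!dir_emit_dots(file, ctx))
			return 0;
		if (ctx->pos == 2) {
			if (!dir_emit(ctx, "hello", 5, 3 /* ino */, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}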
@@ -1225,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file,
1225 if (name.name[1] == '.' && name.len == 2) 1223 if (name.name[1] == '.' && name.len == 2)
1226 return 0; 1224 return 0;
1227 } 1225 }
1226
1227 if (invalid_nodeid(o->nodeid))
1228 return -EIO;
1229 if (!fuse_valid_type(o->attr.mode))
1230 return -EIO;
1231
1228 fc = get_fuse_conn(dir); 1232 fc = get_fuse_conn(dir);
1229 1233
1230 name.hash = full_name_hash(name.name, name.len); 1234 name.hash = full_name_hash(name.name, name.len);
1231 dentry = d_lookup(parent, &name); 1235 dentry = d_lookup(parent, &name);
1232 if (dentry && dentry->d_inode) { 1236 if (dentry) {
1233 inode = dentry->d_inode; 1237 inode = dentry->d_inode;
1234 if (get_node_id(inode) == o->nodeid) { 1238 if (!inode) {
1239 d_drop(dentry);
1240 } else if (get_node_id(inode) != o->nodeid ||
1241 ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
1242 err = d_invalidate(dentry);
1243 if (err)
1244 goto out;
1245 } else if (is_bad_inode(inode)) {
1246 err = -EIO;
1247 goto out;
1248 } else {
1235 struct fuse_inode *fi; 1249 struct fuse_inode *fi;
1236 fi = get_fuse_inode(inode); 1250 fi = get_fuse_inode(inode);
1237 spin_lock(&fc->lock); 1251 spin_lock(&fc->lock);
1238 fi->nlookup++; 1252 fi->nlookup++;
1239 spin_unlock(&fc->lock); 1253 spin_unlock(&fc->lock);
1240 1254
1255 fuse_change_attributes(inode, &o->attr,
1256 entry_attr_timeout(o),
1257 attr_version);
1258
1241 /* 1259 /*
1242 * The other branch to 'found' comes via fuse_iget() 1260 * The other branch to 'found' comes via fuse_iget()
1243 * which bumps nlookup inside 1261 * which bumps nlookup inside
1244 */ 1262 */
1245 goto found; 1263 goto found;
1246 } 1264 }
1247 err = d_invalidate(dentry);
1248 if (err)
1249 goto out;
1250 dput(dentry); 1265 dput(dentry);
1251 dentry = NULL;
1252 } 1266 }
1253 1267
1254 dentry = d_alloc(parent, &name); 1268 dentry = d_alloc(parent, &name);
@@ -1261,30 +1275,35 @@ static int fuse_direntplus_link(struct file *file,
1261 if (!inode) 1275 if (!inode)
1262 goto out; 1276 goto out;
1263 1277
1264 alias = d_materialise_unique(dentry, inode); 1278 if (S_ISDIR(inode->i_mode)) {
1265 err = PTR_ERR(alias); 1279 mutex_lock(&fc->inst_mutex);
1266 if (IS_ERR(alias)) 1280 alias = fuse_d_add_directory(dentry, inode);
1267 goto out; 1281 mutex_unlock(&fc->inst_mutex);
1282 err = PTR_ERR(alias);
1283 if (IS_ERR(alias)) {
1284 iput(inode);
1285 goto out;
1286 }
1287 } else {
1288 alias = d_splice_alias(inode, dentry);
1289 }
1290
1268 if (alias) { 1291 if (alias) {
1269 dput(dentry); 1292 dput(dentry);
1270 dentry = alias; 1293 dentry = alias;
1271 } 1294 }
1272 1295
1273found: 1296found:
1274 fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
1275 attr_version);
1276
1277 fuse_change_entry_timeout(dentry, o); 1297 fuse_change_entry_timeout(dentry, o);
1278 1298
1279 err = 0; 1299 err = 0;
1280out: 1300out:
1281 if (dentry) 1301 dput(dentry);
1282 dput(dentry);
1283 return err; 1302 return err;
1284} 1303}
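The new mode comparison catches a server that reused a nodeid for an object of a different type, in which case the cached dentry is stale and must be invalidated. The test isolated as a predicate (sketch):

	/* Two mode values describe the same kind of object iff their
	 * file-type bits agree. */
	static inline bool same_file_type(umode_t a, umode_t b)
	{
		return ((a ^ b) & S_IFMT) == 0;
	}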
1285 1304
1286static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, 1305static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1287 void *dstbuf, filldir_t filldir, u64 attr_version) 1306 struct dir_context *ctx, u64 attr_version)
1288{ 1307{
1289 struct fuse_direntplus *direntplus; 1308 struct fuse_direntplus *direntplus;
1290 struct fuse_dirent *dirent; 1309 struct fuse_dirent *dirent;
@@ -1309,10 +1328,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1309 we need to send a FORGET for each of those 1328 we need to send a FORGET for each of those
1310 which we did not link. 1329 which we did not link.
1311 */ 1330 */
1312 over = filldir(dstbuf, dirent->name, dirent->namelen, 1331 over = !dir_emit(ctx, dirent->name, dirent->namelen,
1313 file->f_pos, dirent->ino, 1332 dirent->ino, dirent->type);
1314 dirent->type); 1333 ctx->pos = dirent->off;
1315 file->f_pos = dirent->off;
1316 } 1334 }
1317 1335
1318 buf += reclen; 1336 buf += reclen;
@@ -1326,7 +1344,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1326 return 0; 1344 return 0;
1327} 1345}
1328 1346
1329static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) 1347static int fuse_readdir(struct file *file, struct dir_context *ctx)
1330{ 1348{
1331 int plus, err; 1349 int plus, err;
1332 size_t nbytes; 1350 size_t nbytes;
@@ -1349,17 +1367,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1349 return -ENOMEM; 1367 return -ENOMEM;
1350 } 1368 }
1351 1369
1352 plus = fuse_use_readdirplus(inode, file); 1370 plus = fuse_use_readdirplus(inode, ctx);
1353 req->out.argpages = 1; 1371 req->out.argpages = 1;
1354 req->num_pages = 1; 1372 req->num_pages = 1;
1355 req->pages[0] = page; 1373 req->pages[0] = page;
1356 req->page_descs[0].length = PAGE_SIZE; 1374 req->page_descs[0].length = PAGE_SIZE;
1357 if (plus) { 1375 if (plus) {
1358 attr_version = fuse_get_attr_version(fc); 1376 attr_version = fuse_get_attr_version(fc);
1359 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1377 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1360 FUSE_READDIRPLUS); 1378 FUSE_READDIRPLUS);
1361 } else { 1379 } else {
1362 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1380 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1363 FUSE_READDIR); 1381 FUSE_READDIR);
1364 } 1382 }
1365 fuse_request_send(fc, req); 1383 fuse_request_send(fc, req);
@@ -1369,11 +1387,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1369 if (!err) { 1387 if (!err) {
1370 if (plus) { 1388 if (plus) {
1371 err = parse_dirplusfile(page_address(page), nbytes, 1389 err = parse_dirplusfile(page_address(page), nbytes,
1372 file, dstbuf, filldir, 1390 file, ctx,
1373 attr_version); 1391 attr_version);
1374 } else { 1392 } else {
1375 err = parse_dirfile(page_address(page), nbytes, file, 1393 err = parse_dirfile(page_address(page), nbytes, file,
1376 dstbuf, filldir); 1394 ctx);
1377 } 1395 }
1378 } 1396 }
1379 1397
@@ -1886,7 +1904,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
1886static const struct file_operations fuse_dir_operations = { 1904static const struct file_operations fuse_dir_operations = {
1887 .llseek = generic_file_llseek, 1905 .llseek = generic_file_llseek,
1888 .read = generic_read_dir, 1906 .read = generic_read_dir,
1889 .readdir = fuse_readdir, 1907 .iterate = fuse_readdir,
1890 .open = fuse_dir_open, 1908 .open = fuse_dir_open,
1891 .release = fuse_dir_release, 1909 .release = fuse_dir_release,
1892 .fsync = fuse_dir_fsync, 1910 .fsync = fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 35f281033142..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -548,8 +548,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
548 res = io->bytes < 0 ? io->size : io->bytes; 548 res = io->bytes < 0 ? io->size : io->bytes;
549 549
550 if (!is_sync_kiocb(io->iocb)) { 550 if (!is_sync_kiocb(io->iocb)) {
551 struct path *path = &io->iocb->ki_filp->f_path; 551 struct inode *inode = file_inode(io->iocb->ki_filp);
552 struct inode *inode = path->dentry->d_inode;
553 struct fuse_conn *fc = get_fuse_conn(inode); 552 struct fuse_conn *fc = get_fuse_conn(inode);
554 struct fuse_inode *fi = get_fuse_inode(inode); 553 struct fuse_inode *fi = get_fuse_inode(inode);
555 554
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
785static void sanitize_global_limit(unsigned *limit) 785static void sanitize_global_limit(unsigned *limit)
786{ 786{
787 if (*limit == 0) 787 if (*limit == 0)
788 *limit = ((num_physpages << PAGE_SHIFT) >> 13) / 788 *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
789 sizeof(struct fuse_req); 789 sizeof(struct fuse_req);
790 790
791 if (*limit >= 1 << 16) 791 if (*limit >= 1 << 16)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5a376ab81feb..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
20 be found here: http://sources.redhat.com/cluster 20 be found here: http://sources.redhat.com/cluster
21 21
22 The "nolock" lock module is now built in to GFS2 by default. If 22 The "nolock" lock module is now built in to GFS2 by default. If
23 you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 23 you want to use the DLM, be sure to enable IPv4/6 networking.
24 networking.
25 24
26config GFS2_FS_LOCKING_DLM 25config GFS2_FS_LOCKING_DLM
27 bool "GFS2 DLM locking" 26 bool "GFS2 DLM locking"
28 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ 27 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
29 HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) 28 CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
30 help 29 help
31 Multiple node locking module for GFS2 30 Multiple node locking module for GFS2
32 31
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
110 /* Is the page fully outside i_size? (truncate in progress) */ 110 /* Is the page fully outside i_size? (truncate in progress) */
111 offset = i_size & (PAGE_CACHE_SIZE-1); 111 offset = i_size & (PAGE_CACHE_SIZE-1);
112 if (page->index > end_index || (page->index == end_index && !offset)) { 112 if (page->index > end_index || (page->index == end_index && !offset)) {
113 page->mapping->a_ops->invalidatepage(page, 0); 113 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
114 goto out; 114 goto out;
115 } 115 }
116 return 1; 116 return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
299 299
300 /* Is the page fully outside i_size? (truncate in progress) */ 300 /* Is the page fully outside i_size? (truncate in progress) */
301 if (page->index > end_index || (page->index == end_index && !offset)) { 301 if (page->index > end_index || (page->index == end_index && !offset)) {
302 page->mapping->a_ops->invalidatepage(page, 0); 302 page->mapping->a_ops->invalidatepage(page, 0,
303 PAGE_CACHE_SIZE);
303 unlock_page(page); 304 unlock_page(page);
304 continue; 305 continue;
305 } 306 }
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
943 unlock_buffer(bh); 944 unlock_buffer(bh);
944} 945}
945 946
946static void gfs2_invalidatepage(struct page *page, unsigned long offset) 947static void gfs2_invalidatepage(struct page *page, unsigned int offset,
948 unsigned int length)
947{ 949{
948 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 950 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
951 unsigned int stop = offset + length;
952 int partial_page = (offset || length < PAGE_CACHE_SIZE);
949 struct buffer_head *bh, *head; 953 struct buffer_head *bh, *head;
950 unsigned long pos = 0; 954 unsigned long pos = 0;
951 955
952 BUG_ON(!PageLocked(page)); 956 BUG_ON(!PageLocked(page));
953 if (offset == 0) 957 if (!partial_page)
954 ClearPageChecked(page); 958 ClearPageChecked(page);
955 if (!page_has_buffers(page)) 959 if (!page_has_buffers(page))
956 goto out; 960 goto out;
957 961
958 bh = head = page_buffers(page); 962 bh = head = page_buffers(page);
959 do { 963 do {
964 if (pos + bh->b_size > stop)
965 return;
966
960 if (offset <= pos) 967 if (offset <= pos)
961 gfs2_discard(sdp, bh); 968 gfs2_discard(sdp, bh);
962 pos += bh->b_size; 969 pos += bh->b_size;
963 bh = bh->b_this_page; 970 bh = bh->b_this_page;
964 } while (bh != head); 971 } while (bh != head);
965out: 972out:
966 if (offset == 0) 973 if (!partial_page)
967 try_to_release_page(page, 0); 974 try_to_release_page(page, 0);
968} 975}
969 976
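Now that invalidatepage carries a length as well as an offset, gfs2 has to distinguish a full-page invalidation, where it may clear the checked flag and try to release the page, from a partial one, where it may only discard buffers lying wholly inside the range. The test used above, pulled out (sketch; PAGE_CACHE_SIZE was the page-size macro of this era):

	static inline bool invalidate_is_partial(unsigned int offset,
						 unsigned int length)
	{
		return offset != 0 || length < PAGE_CACHE_SIZE;
	}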
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93b5809c20bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
1232 unstuff = 1; 1232 unstuff = 1;
1233 } 1233 }
1234 1234
1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); 1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1236 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1237 0 : RES_QUOTA), 0);
1236 if (error) 1238 if (error)
1237 goto do_grow_release; 1239 goto do_grow_release;
1238 1240
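do_grow() now also reserves room for a quota change, but only when quotas are enabled on the mount. The reservation sizing as a helper (sketch):

	static inline unsigned int example_grow_reservation(const struct gfs2_sbd *sdp)
	{
		return RES_DINODE + RES_STATFS + RES_RG_BIT +
		       (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA);
	}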
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
109 return 0; 109 return 0;
110} 110}
111 111
112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, 112static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
113 struct qstr *str)
114{ 113{
115 str->hash = gfs2_disk_hash(str->name, str->len); 114 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0; 115 return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b631c9043460..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1125,13 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1125 if (IS_ERR(hc)) 1125 if (IS_ERR(hc))
1126 return PTR_ERR(hc); 1126 return PTR_ERR(hc);
1127 1127
1128 h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); 1128 hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
1129 if (hc2 == NULL) 1129 if (hc2 == NULL)
1130 hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); 1130 hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
1131 1131
1132 if (!hc2) 1132 if (!hc2)
1133 return -ENOMEM; 1133 return -ENOMEM;
1134 1134
1135 h = hc2;
1135 error = gfs2_meta_inode_buffer(dip, &dibh); 1136 error = gfs2_meta_inode_buffer(dip, &dibh);
1136 if (error) 1137 if (error)
1137 goto out_kfree; 1138 goto out_kfree;
@@ -1212,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
1212/** 1213/**
1213 * do_filldir_main - read out directory entries 1214 * do_filldir_main - read out directory entries
1214 * @dip: The GFS2 inode 1215 * @dip: The GFS2 inode
1215 * @offset: The offset in the file to read from 1216 * @ctx: what to feed the entries to
1216 * @opaque: opaque data to pass to filldir
1217 * @filldir: The function to pass entries to
1218 * @darr: an array of struct gfs2_dirent pointers to read 1217 * @darr: an array of struct gfs2_dirent pointers to read
1219 * @entries: the number of entries in darr 1218 * @entries: the number of entries in darr
 1220 * @copied: pointer to int that's non-zero if an entry has been copied out 1219
@@ -1224,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
1224 * the possibility that they will fall into different readdir buffers or 1223 * the possibility that they will fall into different readdir buffers or
1225 * that someone will want to seek to that location. 1224 * that someone will want to seek to that location.
1226 * 1225 *
1227 * Returns: errno, >0 on exception from filldir 1226 * Returns: errno, >0 if the actor tells you to stop
1228 */ 1227 */
1229 1228
1230static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, 1229static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
1231 void *opaque, filldir_t filldir,
1232 const struct gfs2_dirent **darr, u32 entries, 1230 const struct gfs2_dirent **darr, u32 entries,
1233 int *copied) 1231 int *copied)
1234{ 1232{
@@ -1236,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1236 u64 off, off_next; 1234 u64 off, off_next;
1237 unsigned int x, y; 1235 unsigned int x, y;
1238 int run = 0; 1236 int run = 0;
1239 int error = 0;
1240 1237
1241 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); 1238 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1242 1239
@@ -1253,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1253 off_next = be32_to_cpu(dent_next->de_hash); 1250 off_next = be32_to_cpu(dent_next->de_hash);
1254 off_next = gfs2_disk_hash2offset(off_next); 1251 off_next = gfs2_disk_hash2offset(off_next);
1255 1252
1256 if (off < *offset) 1253 if (off < ctx->pos)
1257 continue; 1254 continue;
1258 *offset = off; 1255 ctx->pos = off;
1259 1256
1260 if (off_next == off) { 1257 if (off_next == off) {
1261 if (*copied && !run) 1258 if (*copied && !run)
@@ -1264,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1264 } else 1261 } else
1265 run = 0; 1262 run = 0;
1266 } else { 1263 } else {
1267 if (off < *offset) 1264 if (off < ctx->pos)
1268 continue; 1265 continue;
1269 *offset = off; 1266 ctx->pos = off;
1270 } 1267 }
1271 1268
1272 error = filldir(opaque, (const char *)(dent + 1), 1269 if (!dir_emit(ctx, (const char *)(dent + 1),
1273 be16_to_cpu(dent->de_name_len), 1270 be16_to_cpu(dent->de_name_len),
1274 off, be64_to_cpu(dent->de_inum.no_addr), 1271 be64_to_cpu(dent->de_inum.no_addr),
1275 be16_to_cpu(dent->de_type)); 1272 be16_to_cpu(dent->de_type)))
1276 if (error)
1277 return 1; 1273 return 1;
1278 1274
1279 *copied = 1; 1275 *copied = 1;
1280 } 1276 }
1281 1277
1282 /* Increment the *offset by one, so the next time we come into the 1278 /* Increment the ctx->pos by one, so the next time we come into the
1283 do_filldir fxn, we get the next entry instead of the last one in the 1279 do_filldir fxn, we get the next entry instead of the last one in the
1284 current leaf */ 1280 current leaf */
1285 1281
1286 (*offset)++; 1282 ctx->pos++;
1287 1283
1288 return 0; 1284 return 0;
1289} 1285}
@@ -1307,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
1307 kfree(ptr); 1303 kfree(ptr);
1308} 1304}
1309 1305
1310static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1306static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1311 filldir_t filldir, int *copied, unsigned *depth, 1307 int *copied, unsigned *depth,
1312 u64 leaf_no) 1308 u64 leaf_no)
1313{ 1309{
1314 struct gfs2_inode *ip = GFS2_I(inode); 1310 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1386,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1386 } while(lfn); 1382 } while(lfn);
1387 1383
1388 BUG_ON(entries2 != entries); 1384 BUG_ON(entries2 != entries);
1389 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1385 error = do_filldir_main(ip, ctx, darr, entries, copied);
1390 entries, copied);
1391out_free: 1386out_free:
1392 for(i = 0; i < leaf; i++) 1387 for(i = 0; i < leaf; i++)
1393 brelse(larr[i]); 1388 brelse(larr[i]);
@@ -1446,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
1446/** 1441/**
1447 * dir_e_read - Reads the entries from a directory into a filldir buffer 1442 * dir_e_read - Reads the entries from a directory into a filldir buffer
1448 * @dip: dinode pointer 1443 * @dip: dinode pointer
1449 * @offset: the hash of the last entry read shifted to the right once 1444 * @ctx: actor to feed the entries to
1450 * @opaque: buffer for the filldir function to fill
1451 * @filldir: points to the filldir function to use
1452 * 1445 *
1453 * Returns: errno 1446 * Returns: errno
1454 */ 1447 */
1455 1448
1456static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, 1449static int dir_e_read(struct inode *inode, struct dir_context *ctx,
1457 filldir_t filldir, struct file_ra_state *f_ra) 1450 struct file_ra_state *f_ra)
1458{ 1451{
1459 struct gfs2_inode *dip = GFS2_I(inode); 1452 struct gfs2_inode *dip = GFS2_I(inode);
1460 u32 hsize, len = 0; 1453 u32 hsize, len = 0;
@@ -1465,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1465 unsigned depth = 0; 1458 unsigned depth = 0;
1466 1459
1467 hsize = 1 << dip->i_depth; 1460 hsize = 1 << dip->i_depth;
1468 hash = gfs2_dir_offset2hash(*offset); 1461 hash = gfs2_dir_offset2hash(ctx->pos);
1469 index = hash >> (32 - dip->i_depth); 1462 index = hash >> (32 - dip->i_depth);
1470 1463
1471 if (dip->i_hash_cache == NULL) 1464 if (dip->i_hash_cache == NULL)
@@ -1477,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1477 gfs2_dir_readahead(inode, hsize, index, f_ra); 1470 gfs2_dir_readahead(inode, hsize, index, f_ra);
1478 1471
1479 while (index < hsize) { 1472 while (index < hsize) {
1480 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, 1473 error = gfs2_dir_read_leaf(inode, ctx,
1481 &copied, &depth, 1474 &copied, &depth,
1482 be64_to_cpu(lp[index])); 1475 be64_to_cpu(lp[index]));
1483 if (error) 1476 if (error)
@@ -1492,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1492 return error; 1485 return error;
1493} 1486}
1494 1487
1495int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 1488int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1496 filldir_t filldir, struct file_ra_state *f_ra) 1489 struct file_ra_state *f_ra)
1497{ 1490{
1498 struct gfs2_inode *dip = GFS2_I(inode); 1491 struct gfs2_inode *dip = GFS2_I(inode);
1499 struct gfs2_sbd *sdp = GFS2_SB(inode); 1492 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1507,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1507 return 0; 1500 return 0;
1508 1501
1509 if (dip->i_diskflags & GFS2_DIF_EXHASH) 1502 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1510 return dir_e_read(inode, offset, opaque, filldir, f_ra); 1503 return dir_e_read(inode, ctx, f_ra);
1511 1504
1512 if (!gfs2_is_stuffed(dip)) { 1505 if (!gfs2_is_stuffed(dip)) {
1513 gfs2_consist_inode(dip); 1506 gfs2_consist_inode(dip);
@@ -1539,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1539 error = -EIO; 1532 error = -EIO;
1540 goto out; 1533 goto out;
1541 } 1534 }
1542 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1535 error = do_filldir_main(dip, ctx, darr,
1543 dip->i_entries, &copied); 1536 dip->i_entries, &copied);
1544out: 1537out:
1545 kfree(darr); 1538 kfree(darr);
@@ -1555,9 +1548,9 @@ out:
1555 1548
1556/** 1549/**
1557 * gfs2_dir_search - Search a directory 1550 * gfs2_dir_search - Search a directory
1558 * @dip: The GFS2 inode 1551 * @dip: The GFS2 dir inode
1559 * @filename: 1552 * @name: The name we are looking up
1560 * @inode: 1553 * @fail_on_exist: Fail if the name exists rather than looking it up
1561 * 1554 *
1562 * This routine searches a directory for a file or another directory. 1555 * This routine searches a directory for a file or another directory.
1563 * Assumes a glock is held on dip. 1556 * Assumes a glock is held on dip.
@@ -1565,22 +1558,25 @@ out:
1565 * Returns: errno 1558 * Returns: errno
1566 */ 1559 */
1567 1560
1568struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) 1561struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
1562 bool fail_on_exist)
1569{ 1563{
1570 struct buffer_head *bh; 1564 struct buffer_head *bh;
1571 struct gfs2_dirent *dent; 1565 struct gfs2_dirent *dent;
1572 struct inode *inode; 1566 u64 addr, formal_ino;
1567 u16 dtype;
1573 1568
1574 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); 1569 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1575 if (dent) { 1570 if (dent) {
1576 if (IS_ERR(dent)) 1571 if (IS_ERR(dent))
1577 return ERR_CAST(dent); 1572 return ERR_CAST(dent);
1578 inode = gfs2_inode_lookup(dir->i_sb, 1573 dtype = be16_to_cpu(dent->de_type);
1579 be16_to_cpu(dent->de_type), 1574 addr = be64_to_cpu(dent->de_inum.no_addr);
1580 be64_to_cpu(dent->de_inum.no_addr), 1575 formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
1581 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1582 brelse(bh); 1576 brelse(bh);
1583 return inode; 1577 if (fail_on_exist)
1578 return ERR_PTR(-EEXIST);
1579 return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
1584 } 1580 }
1585 return ERR_PTR(-ENOENT); 1581 return ERR_PTR(-ENOENT);
1586} 1582}
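With fail_on_exist a caller can probe whether a name is taken without paying for inode instantiation, useful for detecting a lost create race. A hypothetical caller (sketch):

	/* Returns 0 if @name is free in @dir, -EEXIST if taken, or another
	 * errno on failure (hypothetical helper). */
	static int example_name_is_free(struct inode *dir, const struct qstr *name)
	{
		struct inode *inode = gfs2_dir_search(dir, name, true);

		if (inode == ERR_PTR(-ENOENT))
			return 0;
		if (IS_ERR(inode))
			return PTR_ERR(inode);	/* includes -EEXIST */
		iput(inode);			/* not expected with fail_on_exist */
		return -EEXIST;
	}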
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab35..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20extern struct inode *gfs2_dir_search(struct inode *dir, 20extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename); 21 const struct qstr *filename,
22 bool fail_on_exist);
22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23 const struct gfs2_inode *ip); 24 const struct gfs2_inode *ip);
24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25 const struct gfs2_inode *ip); 26 const struct gfs2_inode *ip);
26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
28 filldir_t filldir, struct file_ra_state *f_ra); 29 struct file_ra_state *f_ra);
29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 30extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type); 31 const struct gfs2_inode *nip, unsigned int new_type);
31 32
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4ff565..8b9b3775e2e7 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
64} 64}
65 65
66struct get_name_filldir { 66struct get_name_filldir {
67 struct dir_context ctx;
67 struct gfs2_inum_host inum; 68 struct gfs2_inum_host inum;
68 char *name; 69 char *name;
69}; 70};
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
88 struct inode *dir = parent->d_inode; 89 struct inode *dir = parent->d_inode;
89 struct inode *inode = child->d_inode; 90 struct inode *inode = child->d_inode;
90 struct gfs2_inode *dip, *ip; 91 struct gfs2_inode *dip, *ip;
91 struct get_name_filldir gnfd; 92 struct get_name_filldir gnfd = {
93 .ctx.actor = get_name_filldir,
94 .name = name
95 };
92 struct gfs2_holder gh; 96 struct gfs2_holder gh;
93 u64 offset = 0;
94 int error; 97 int error;
95 struct file_ra_state f_ra = { .start = 0 }; 98 struct file_ra_state f_ra = { .start = 0 };
96 99
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
106 *name = 0; 109 *name = 0;
107 gnfd.inum.no_addr = ip->i_no_addr; 110 gnfd.inum.no_addr = ip->i_no_addr;
108 gnfd.inum.no_formal_ino = ip->i_no_formal_ino; 111 gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
109 gnfd.name = name;
110 112
111 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); 113 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
112 if (error) 114 if (error)
113 return error; 115 return error;
114 116
115 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); 117 error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
116 118
117 gfs2_glock_dq_uninit(&gh); 119 gfs2_glock_dq_uninit(&gh);
118 120
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ad0dc38d87ab..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
82} 82}
83 83
84/** 84/**
85 * gfs2_readdir - Read directory entries from a directory 85 * gfs2_readdir - Iterator for a directory
86 * @file: The directory to read from 86 * @file: The directory to read from
87 * @dirent: Buffer for dirents 87 * @ctx: What to feed directory entries to
88 * @filldir: Function used to do the copying
89 * 88 *
90 * Returns: errno 89 * Returns: errno
91 */ 90 */
92 91
93static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) 92static int gfs2_readdir(struct file *file, struct dir_context *ctx)
94{ 93{
95 struct inode *dir = file->f_mapping->host; 94 struct inode *dir = file->f_mapping->host;
96 struct gfs2_inode *dip = GFS2_I(dir); 95 struct gfs2_inode *dip = GFS2_I(dir);
97 struct gfs2_holder d_gh; 96 struct gfs2_holder d_gh;
98 u64 offset = file->f_pos;
99 int error; 97 int error;
100 98
101 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 99 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
102 error = gfs2_glock_nq(&d_gh); 100 if (error)
103 if (error) {
104 gfs2_holder_uninit(&d_gh);
105 return error; 101 return error;
106 }
107 102
108 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); 103 error = gfs2_dir_read(dir, ctx, &file->f_ra);
109 104
110 gfs2_glock_dq_uninit(&d_gh); 105 gfs2_glock_dq_uninit(&d_gh);
111 106
112 file->f_pos = offset;
113
114 return error; 107 return error;
115} 108}
116 109
@@ -538,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
538} 531}
539 532
540/** 533/**
541 * gfs2_open - open a file 534 * gfs2_open_common - This is common to open and atomic_open
542 * @inode: the inode to open 535 * @inode: The inode being opened
543 * @file: the struct file for this opening 536 * @file: The file being opened
544 * 537 *
 545 * Returns: errno 538 * This may be called under a glock or not depending upon how it has
539 * been called. We must always be called under a glock for regular
540 * files, however. For other file types, it does not matter whether
541 * we hold the glock or not.
542 *
543 * Returns: Error code or 0 for success
546 */ 544 */
547 545
548static int gfs2_open(struct inode *inode, struct file *file) 546int gfs2_open_common(struct inode *inode, struct file *file)
549{ 547{
550 struct gfs2_inode *ip = GFS2_I(inode);
551 struct gfs2_holder i_gh;
552 struct gfs2_file *fp; 548 struct gfs2_file *fp;
553 int error; 549 int ret;
550
551 if (S_ISREG(inode->i_mode)) {
552 ret = generic_file_open(inode, file);
553 if (ret)
554 return ret;
555 }
554 556
555 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); 557 fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
556 if (!fp) 558 if (!fp)
557 return -ENOMEM; 559 return -ENOMEM;
558 560
@@ -560,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
560 562
561 gfs2_assert_warn(GFS2_SB(inode), !file->private_data); 563 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
562 file->private_data = fp; 564 file->private_data = fp;
565 return 0;
566}
567
568/**
569 * gfs2_open - open a file
570 * @inode: the inode to open
571 * @file: the struct file for this opening
572 *
573 * After atomic_open, this function is only used for opening files
574 * which are already cached. We must still get the glock for regular
575 * files to ensure that we have the file size uptodate for the large
576 * file check which is in the common code. That is only an issue for
577 * regular files though.
578 *
579 * Returns: errno
580 */
581
582static int gfs2_open(struct inode *inode, struct file *file)
583{
584 struct gfs2_inode *ip = GFS2_I(inode);
585 struct gfs2_holder i_gh;
586 int error;
587 bool need_unlock = false;
563 588
564 if (S_ISREG(ip->i_inode.i_mode)) { 589 if (S_ISREG(ip->i_inode.i_mode)) {
565 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 590 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
566 &i_gh); 591 &i_gh);
567 if (error) 592 if (error)
568 goto fail; 593 return error;
594 need_unlock = true;
595 }
569 596
570 if (!(file->f_flags & O_LARGEFILE) && 597 error = gfs2_open_common(inode, file);
571 i_size_read(inode) > MAX_NON_LFS) {
572 error = -EOVERFLOW;
573 goto fail_gunlock;
574 }
575 598
599 if (need_unlock)
576 gfs2_glock_dq_uninit(&i_gh); 600 gfs2_glock_dq_uninit(&i_gh);
577 }
578 601
579 return 0;
580
581fail_gunlock:
582 gfs2_glock_dq_uninit(&i_gh);
583fail:
584 file->private_data = NULL;
585 kfree(fp);
586 return error; 602 return error;
587} 603}
588 604
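The split leaves gfs2_open() as a thin wrapper: regular files take a shared glock around the common open so that i_size is current for the O_LARGEFILE check inside generic_file_open(); other file types skip the lock. Its shape, reduced to essentials (a sketch restating the hunk above, not a replacement for it):

	static int example_open(struct inode *inode, struct file *file)
	{
		struct gfs2_inode *ip = GFS2_I(inode);
		struct gfs2_holder gh;
		int error;

		if (!S_ISREG(inode->i_mode))
			return gfs2_open_common(inode, file);

		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
		if (error)
			return error;
		error = gfs2_open_common(inode, file);
		gfs2_glock_dq_uninit(&gh);
		return error;
	}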
@@ -896,7 +912,7 @@ out_uninit:
896 * cluster; until we do, disable leases (by just returning -EINVAL), 912 * cluster; until we do, disable leases (by just returning -EINVAL),
897 * unless the administrator has requested purely local locking. 913 * unless the administrator has requested purely local locking.
898 * 914 *
899 * Locking: called under lock_flocks 915 * Locking: called under i_lock
900 * 916 *
901 * Returns: errno 917 * Returns: errno
902 */ 918 */
@@ -1048,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
1048}; 1064};
1049 1065
1050const struct file_operations gfs2_dir_fops = { 1066const struct file_operations gfs2_dir_fops = {
1051 .readdir = gfs2_readdir, 1067 .iterate = gfs2_readdir,
1052 .unlocked_ioctl = gfs2_ioctl, 1068 .unlocked_ioctl = gfs2_ioctl,
1053 .open = gfs2_open, 1069 .open = gfs2_open,
1054 .release = gfs2_release, 1070 .release = gfs2_release,
@@ -1078,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
1078}; 1094};
1079 1095
1080const struct file_operations gfs2_dir_fops_nolock = { 1096const struct file_operations gfs2_dir_fops_nolock = {
1081 .readdir = gfs2_readdir, 1097 .iterate = gfs2_readdir,
1082 .unlocked_ioctl = gfs2_ioctl, 1098 .unlocked_ioctl = gfs2_ioctl,
1083 .open = gfs2_open, 1099 .open = gfs2_open,
1084 .release = gfs2_release, 1100 .release = gfs2_release,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
54 struct gfs2_bufdata *bd, *tmp; 54 struct gfs2_bufdata *bd, *tmp;
55 struct buffer_head *bh; 55 struct buffer_head *bh;
56 const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); 56 const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
57 sector_t blocknr;
58 57
59 gfs2_log_lock(sdp); 58 gfs2_log_lock(sdp);
60 spin_lock(&sdp->sd_ail_lock); 59 spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
65 continue; 64 continue;
66 gfs2_ail_error(gl, bh); 65 gfs2_ail_error(gl, bh);
67 } 66 }
68 blocknr = bh->b_blocknr;
69 bh->b_private = NULL;
70 gfs2_remove_from_ail(bd); /* drops ref on bh */
71
72 bd->bd_bh = NULL;
73 bd->bd_blkno = blocknr;
74
75 gfs2_trans_add_revoke(sdp, bd); 67 gfs2_trans_add_revoke(sdp, bd);
76 } 68 }
77 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); 69 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e4a9e4..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -313,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
313 goto out; 313 goto out;
314 } 314 }
315 315
316 inode = gfs2_dir_search(dir, name); 316 inode = gfs2_dir_search(dir, name, false);
317 if (IS_ERR(inode)) 317 if (IS_ERR(inode))
318 error = PTR_ERR(inode); 318 error = PTR_ERR(inode);
319out: 319out:
@@ -346,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
346 if (!dip->i_inode.i_nlink) 346 if (!dip->i_inode.i_nlink)
347 return -ENOENT; 347 return -ENOENT;
348 348
349 error = gfs2_dir_check(&dip->i_inode, name, NULL);
350 switch (error) {
351 case -ENOENT:
352 error = 0;
353 break;
354 case 0:
355 return -EEXIST;
356 default:
357 return error;
358 }
359
360 if (dip->i_entries == (u32)-1) 349 if (dip->i_entries == (u32)-1)
361 return -EFBIG; 350 return -EFBIG;
362 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 351 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -546,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
546 * gfs2_create_inode - Create a new inode 535 * gfs2_create_inode - Create a new inode
547 * @dir: The parent directory 536 * @dir: The parent directory
548 * @dentry: The new dentry 537 * @dentry: The new dentry
538 * @file: If non-NULL, the file which is being opened
549 * @mode: The permissions on the new inode 539 * @mode: The permissions on the new inode
550 * @dev: For device nodes, this is the device number 540 * @dev: For device nodes, this is the device number
551 * @symname: For symlinks, this is the link destination 541 * @symname: For symlinks, this is the link destination
@@ -555,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
555 */ 545 */
556 546
557static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, 547static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
548 struct file *file,
558 umode_t mode, dev_t dev, const char *symname, 549 umode_t mode, dev_t dev, const char *symname,
559 unsigned int size, int excl) 550 unsigned int size, int excl, int *opened)
560{ 551{
561 const struct qstr *name = &dentry->d_name; 552 const struct qstr *name = &dentry->d_name;
562 struct gfs2_holder ghs[2]; 553 struct gfs2_holder ghs[2];
@@ -564,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
564 struct gfs2_inode *dip = GFS2_I(dir), *ip; 555 struct gfs2_inode *dip = GFS2_I(dir), *ip;
565 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 556 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
566 struct gfs2_glock *io_gl; 557 struct gfs2_glock *io_gl;
558 struct dentry *d;
567 int error; 559 int error;
568 u32 aflags = 0; 560 u32 aflags = 0;
569 int arq; 561 int arq;
@@ -584,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
584 goto fail; 576 goto fail;
585 577
586 error = create_ok(dip, name, mode); 578 error = create_ok(dip, name, mode);
587 if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
588 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
589 gfs2_glock_dq_uninit(ghs);
590 d_instantiate(dentry, inode);
591 return IS_ERR(inode) ? PTR_ERR(inode) : 0;
592 }
593 if (error) 579 if (error)
594 goto fail_gunlock; 580 goto fail_gunlock;
595 581
582 inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
583 error = PTR_ERR(inode);
584 if (!IS_ERR(inode)) {
585 d = d_splice_alias(inode, dentry);
586 error = 0;
587 if (file && !IS_ERR(d)) {
588 if (d == NULL)
589 d = dentry;
590 if (S_ISREG(inode->i_mode))
591 error = finish_open(file, d, gfs2_open_common, opened);
592 else
593 error = finish_no_open(file, d);
594 }
595 gfs2_glock_dq_uninit(ghs);
596 if (IS_ERR(d))
597 return PTR_RET(d);
598 return error;
599 } else if (error != -ENOENT) {
600 goto fail_gunlock;
601 }
602
596 arq = error = gfs2_diradd_alloc_required(dir, name); 603 arq = error = gfs2_diradd_alloc_required(dir, name);
597 if (error < 0) 604 if (error < 0)
598 goto fail_gunlock; 605 goto fail_gunlock;
@@ -686,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
686 goto fail_gunlock3; 693 goto fail_gunlock3;
687 694
688 mark_inode_dirty(inode); 695 mark_inode_dirty(inode);
696 d_instantiate(dentry, inode);
697 if (file)
698 error = finish_open(file, dentry, gfs2_open_common, opened);
689 gfs2_glock_dq_uninit(ghs); 699 gfs2_glock_dq_uninit(ghs);
690 gfs2_glock_dq_uninit(ghs + 1); 700 gfs2_glock_dq_uninit(ghs + 1);
691 d_instantiate(dentry, inode); 701 return error;
692 return 0;
693 702
694fail_gunlock3: 703fail_gunlock3:
695 gfs2_glock_dq_uninit(ghs + 1); 704 gfs2_glock_dq_uninit(ghs + 1);
@@ -729,36 +738,56 @@ fail:
729static int gfs2_create(struct inode *dir, struct dentry *dentry, 738static int gfs2_create(struct inode *dir, struct dentry *dentry,
730 umode_t mode, bool excl) 739 umode_t mode, bool excl)
731{ 740{
732 return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); 741 return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
733} 742}
734 743
735/** 744/**
736 * gfs2_lookup - Look up a filename in a directory and return its inode 745 * __gfs2_lookup - Look up a filename in a directory and return its inode
737 * @dir: The directory inode 746 * @dir: The directory inode
738 * @dentry: The dentry of the new inode 747 * @dentry: The dentry of the new inode
739 * @nd: passed from Linux VFS, ignored by us 748 * @file: File to be opened
749 * @opened: atomic_open flags
740 * 750 *
741 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
742 * 751 *
743 * Returns: errno 752 * Returns: errno
744 */ 753 */
745 754
746static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, 755static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
747 unsigned int flags) 756 struct file *file, int *opened)
748{ 757{
749 struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); 758 struct inode *inode;
750 if (inode && !IS_ERR(inode)) { 759 struct dentry *d;
751 struct gfs2_glock *gl = GFS2_I(inode)->i_gl; 760 struct gfs2_holder gh;
752 struct gfs2_holder gh; 761 struct gfs2_glock *gl;
753 int error; 762 int error;
754 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); 763
755 if (error) { 764 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
756 iput(inode); 765 if (!inode)
757 return ERR_PTR(error); 766 return NULL;
758 } 767 if (IS_ERR(inode))
759 gfs2_glock_dq_uninit(&gh); 768 return ERR_CAST(inode);
769
770 gl = GFS2_I(inode)->i_gl;
771 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
772 if (error) {
773 iput(inode);
774 return ERR_PTR(error);
760 } 775 }
761 return d_splice_alias(inode, dentry); 776
777 d = d_splice_alias(inode, dentry);
778 if (file && S_ISREG(inode->i_mode))
779 error = finish_open(file, dentry, gfs2_open_common, opened);
780
781 gfs2_glock_dq_uninit(&gh);
782 if (error)
783 return ERR_PTR(error);
784 return d;
785}
786
787static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
788 unsigned flags)
789{
790 return __gfs2_lookup(dir, dentry, NULL, NULL);
762} 791}
763 792
764/** 793/**
@@ -1076,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
1076 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) 1105 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
1077 return -ENAMETOOLONG; 1106 return -ENAMETOOLONG;
1078 1107
1079 return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); 1108 return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
1080} 1109}
1081 1110
1082/** 1111/**
@@ -1092,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1092{ 1121{
1093 struct gfs2_sbd *sdp = GFS2_SB(dir); 1122 struct gfs2_sbd *sdp = GFS2_SB(dir);
1094 unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 1123 unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
1095 return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); 1124 return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
1096} 1125}
1097 1126
1098/** 1127/**
@@ -1107,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1107static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 1136static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
1108 dev_t dev) 1137 dev_t dev)
1109{ 1138{
1110 return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); 1139 return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
1140}
1141
1142/**
1143 * gfs2_atomic_open - Atomically open a file
1144 * @dir: The directory
1145 * @dentry: The proposed new entry
1146 * @file: The proposed new struct file
1147 * @flags: open flags
1148 * @mode: File mode
1149 * @opened: Flag to say whether the file has been opened or not
1150 *
1151 * Returns: error code or 0 for success
1152 */
1153
1154static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
1155 struct file *file, unsigned flags,
1156 umode_t mode, int *opened)
1157{
1158 struct dentry *d;
1159 bool excl = !!(flags & O_EXCL);
1160
1161 d = __gfs2_lookup(dir, dentry, file, opened);
1162 if (IS_ERR(d))
1163 return PTR_ERR(d);
1164 if (d == NULL)
1165 d = dentry;
1166 if (d->d_inode) {
1167 if (!(*opened & FILE_OPENED))
1168 return finish_no_open(file, d);
1169 return 0;
1170 }
1171
1172 if (!(flags & O_CREAT))
1173 return -ENOENT;
1174
1175 return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
1111} 1176}
1112 1177
1113/* 1178/*
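For orientation, the ->atomic_open() contract this hunk implements: the VFS calls it with the parent directory locked; the method either opens the file itself via finish_open(), which sets FILE_OPENED in *opened, or hands the dentry back via finish_no_open() so the VFS falls through to the ordinary lookup-then-open path. A skeletal sketch of that shape, with the examplefs_* helpers hypothetical:

	static int examplefs_atomic_open(struct inode *dir, struct dentry *dentry,
					 struct file *file, unsigned flags,
					 umode_t mode, int *opened)
	{
		/* hypothetical helper: a lookup that may splice an alias
		 * and may already have opened the file */
		struct dentry *d = examplefs_lookup_open(dir, dentry, file, opened);

		if (IS_ERR(d))
			return PTR_ERR(d);
		if (d)
			dentry = d;
		if (dentry->d_inode) {
			if (!(*opened & FILE_OPENED))
				return finish_no_open(file, d);
			return 0;
		}
		if (!(flags & O_CREAT))
			return -ENOENT;
		/* hypothetical helper: create and open under the dir lock */
		return examplefs_create_open(dir, dentry, file, mode, opened);
	}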
@@ -1787,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
1787 .removexattr = gfs2_removexattr, 1852 .removexattr = gfs2_removexattr,
1788 .fiemap = gfs2_fiemap, 1853 .fiemap = gfs2_fiemap,
1789 .get_acl = gfs2_get_acl, 1854 .get_acl = gfs2_get_acl,
1855 .atomic_open = gfs2_atomic_open,
1790}; 1856};
1791 1857
1792const struct inode_operations gfs2_symlink_iops = { 1858const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
109extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); 109extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
110extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 110extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
111extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 111extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
112extern int gfs2_open_common(struct inode *inode, struct file *file);
112 113
113extern const struct inode_operations gfs2_file_iops; 114extern const struct inode_operations gfs2_file_iops;
114extern const struct inode_operations gfs2_dir_iops; 115extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
211static int gfs2_ail1_empty(struct gfs2_sbd *sdp) 211static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
212{ 212{
213 struct gfs2_trans *tr, *s; 213 struct gfs2_trans *tr, *s;
214 int oldest_tr = 1;
214 int ret; 215 int ret;
215 216
216 spin_lock(&sdp->sd_ail_lock); 217 spin_lock(&sdp->sd_ail_lock);
217 list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { 218 list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
218 gfs2_ail1_empty_one(sdp, tr); 219 gfs2_ail1_empty_one(sdp, tr);
219 if (list_empty(&tr->tr_ail1_list)) 220 if (list_empty(&tr->tr_ail1_list) && oldest_tr)
220 list_move(&tr->tr_list, &sdp->sd_ail2_list); 221 list_move(&tr->tr_list, &sdp->sd_ail2_list);
221 else 222 else
222 break; 223 oldest_tr = 0;
223 } 224 }
224 ret = list_empty(&sdp->sd_ail1_list); 225 ret = list_empty(&sdp->sd_ail1_list);
225 spin_unlock(&sdp->sd_ail_lock); 226 spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
317 318
318int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 319int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
319{ 320{
320 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 321 unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
321 unsigned wanted = blks + reserved_blks; 322 unsigned wanted = blks + reserved_blks;
322 DEFINE_WAIT(wait); 323 DEFINE_WAIT(wait);
323 int did_wait = 0; 324 int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
545 spin_unlock(&sdp->sd_ordered_lock); 546 spin_unlock(&sdp->sd_ordered_lock);
546} 547}
547 548
549void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
550{
551 struct buffer_head *bh = bd->bd_bh;
552 struct gfs2_glock *gl = bd->bd_gl;
553
554 gfs2_remove_from_ail(bd);
555 bd->bd_bh = NULL;
556 bh->b_private = NULL;
557 bd->bd_blkno = bh->b_blocknr;
558 bd->bd_ops = &gfs2_revoke_lops;
559 sdp->sd_log_num_revoke++;
560 atomic_inc(&gl->gl_revokes);
561 set_bit(GLF_LFLUSH, &gl->gl_flags);
562 list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
563}
564
565void gfs2_write_revokes(struct gfs2_sbd *sdp)
566{
567 struct gfs2_trans *tr;
568 struct gfs2_bufdata *bd, *tmp;
569 int have_revokes = 0;
570 int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
571
572 gfs2_ail1_empty(sdp);
573 spin_lock(&sdp->sd_ail_lock);
574 list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
575 list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
576 if (list_empty(&bd->bd_list)) {
577 have_revokes = 1;
578 goto done;
579 }
580 }
581 }
582done:
583 spin_unlock(&sdp->sd_ail_lock);
584 if (have_revokes == 0)
585 return;
586 while (sdp->sd_log_num_revoke > max_revokes)
587 max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
588 max_revokes -= sdp->sd_log_num_revoke;
589 if (!sdp->sd_log_num_revoke) {
590 atomic_dec(&sdp->sd_log_blks_free);
591 /* If no blocks have been reserved, we need to also
592 * reserve a block for the header */
593 if (!sdp->sd_log_blks_reserved)
594 atomic_dec(&sdp->sd_log_blks_free);
595 }
596 gfs2_log_lock(sdp);
597 spin_lock(&sdp->sd_ail_lock);
598 list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
599 list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
600 if (max_revokes == 0)
601 goto out_of_blocks;
602 if (!list_empty(&bd->bd_list))
603 continue;
604 gfs2_add_revoke(sdp, bd);
605 max_revokes--;
606 }
607 }
608out_of_blocks:
609 spin_unlock(&sdp->sd_ail_lock);
610 gfs2_log_unlock(sdp);
611
612 if (!sdp->sd_log_num_revoke) {
613 atomic_inc(&sdp->sd_log_blks_free);
614 if (!sdp->sd_log_blks_reserved)
615 atomic_inc(&sdp->sd_log_blks_free);
616 }
617}
618
548/** 619/**
549 * log_write_header - Get and initialize a journal header buffer 620 * log_write_header - Get and initialize a journal header buffer
550 * @sdp: The GFS2 superblock 621 * @sdp: The GFS2 superblock
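The max_revokes arithmetic above is worth unpacking. Revokes are written as one log descriptor block followed, if needed, by plain metadata blocks, with each revoke record an 8-byte block number. Assuming the on-disk header sizes of this era (sizeof(struct gfs2_log_descriptor) == 72 and sizeof(struct gfs2_meta_header) == 24), a 4096-byte block gives (4096 - 72) / 8 = 503 revokes in the descriptor block and (4096 - 24) / 8 = 509 per continuation block, which is why the while loop grows max_revokes in continuation-block-sized steps before subtracting the revokes already queued.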
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
562 lh = page_address(page); 633 lh = page_address(page);
563 clear_page(lh); 634 clear_page(lh);
564 635
565 gfs2_ail1_empty(sdp);
566 tail = current_tail(sdp); 636 tail = current_tail(sdp);
567 637
568 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 638 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
72extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); 72extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
73extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 73extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
74extern int gfs2_logd(void *data); 74extern int gfs2_logd(void *data);
75extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
76extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
75 77
76#endif /* __LOG_DOT_H__ */ 78#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c33d7b6e0c4..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/list_sort.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
401 kunmap_atomic(kaddr); 402 kunmap_atomic(kaddr);
402} 403}
403 404
405static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
406{
407 struct gfs2_bufdata *bda, *bdb;
408
409 bda = list_entry(a, struct gfs2_bufdata, bd_list);
410 bdb = list_entry(b, struct gfs2_bufdata, bd_list);
411
412 if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
413 return -1;
414 if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
415 return 1;
416 return 0;
417}
418
404static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, 419static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
405 unsigned int total, struct list_head *blist, 420 unsigned int total, struct list_head *blist,
406 bool is_databuf) 421 bool is_databuf)
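list_sort() from <linux/list_sort.h> is a stable merge sort over a list_head list; the comparator receives the opaque priv pointer plus two nodes and returns negative, zero, or positive, exactly as blocknr_cmp does above. Sorting the buffers by block number means the log writes go out in mostly ascending order, which the block layer can merge far better. The API in isolation, on a hypothetical struct:

	#include <linux/list.h>
	#include <linux/list_sort.h>

	struct item {
		struct list_head link;
		u64 key;
	};

	static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct item *ia = list_entry(a, struct item, link);
		struct item *ib = list_entry(b, struct item, link);

		if (ia->key < ib->key)
			return -1;
		return ia->key > ib->key;	/* 1 if greater, 0 if equal */
	}

	/* usage: list_sort(NULL, &my_list, item_cmp); */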
@@ -413,6 +428,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
413 __be64 *ptr; 428 __be64 *ptr;
414 429
415 gfs2_log_lock(sdp); 430 gfs2_log_lock(sdp);
431 list_sort(NULL, blist, blocknr_cmp);
416 bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); 432 bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
417 while(total) { 433 while(total) {
418 num = total; 434 num = total;
@@ -590,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
590 struct page *page; 606 struct page *page;
591 unsigned int length; 607 unsigned int length;
592 608
609 gfs2_write_revokes(sdp);
593 if (!sdp->sd_log_num_revoke) 610 if (!sdp->sd_log_num_revoke)
594 return; 611 return;
595 612
@@ -836,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
836 .lo_name = "revoke", 853 .lo_name = "revoke",
837}; 854};
838 855
839const struct gfs2_log_operations gfs2_rg_lops = {
840 .lo_name = "rg",
841};
842
843const struct gfs2_log_operations gfs2_databuf_lops = { 856const struct gfs2_log_operations gfs2_databuf_lops = {
844 .lo_before_commit = databuf_lo_before_commit, 857 .lo_before_commit = databuf_lo_before_commit,
845 .lo_after_commit = databuf_lo_after_commit, 858 .lo_after_commit = databuf_lo_after_commit,
@@ -851,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
851const struct gfs2_log_operations *gfs2_log_ops[] = { 864const struct gfs2_log_operations *gfs2_log_ops[] = {
852 &gfs2_databuf_lops, 865 &gfs2_databuf_lops,
853 &gfs2_buf_lops, 866 &gfs2_buf_lops,
854 &gfs2_rg_lops,
855 &gfs2_revoke_lops, 867 &gfs2_revoke_lops,
856 NULL, 868 NULL,
857}; 869};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
23extern const struct gfs2_log_operations gfs2_glock_lops; 23extern const struct gfs2_log_operations gfs2_glock_lops;
24extern const struct gfs2_log_operations gfs2_buf_lops; 24extern const struct gfs2_log_operations gfs2_buf_lops;
25extern const struct gfs2_log_operations gfs2_revoke_lops; 25extern const struct gfs2_log_operations gfs2_revoke_lops;
26extern const struct gfs2_log_operations gfs2_rg_lops;
27extern const struct gfs2_log_operations gfs2_databuf_lops; 26extern const struct gfs2_log_operations gfs2_databuf_lops;
28 27
29extern const struct gfs2_log_operations *gfs2_log_ops[]; 28extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
296 if (bd) { 296 if (bd) {
297 spin_lock(&sdp->sd_ail_lock); 297 spin_lock(&sdp->sd_ail_lock);
298 if (bd->bd_tr) { 298 if (bd->bd_tr) {
299 gfs2_remove_from_ail(bd);
300 bh->b_private = NULL;
301 bd->bd_bh = NULL;
302 bd->bd_blkno = bh->b_blocknr;
303 gfs2_trans_add_revoke(sdp, bd); 299 gfs2_trans_add_revoke(sdp, bd);
304 } 300 }
305 spin_unlock(&sdp->sd_ail_lock); 301 spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
916 goto fail_quotad; 916 goto fail_quotad;
917 917
918 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 918 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
919 error = IS_ERR(p); 919 if (IS_ERR(p)) {
920 if (error) { 920 error = PTR_ERR(p);
921 fs_err(sdp, "can't start logd thread: %d\n", error); 921 fs_err(sdp, "can't start logd thread: %d\n", error);
922 return error; 922 return error;
923 } 923 }
924 sdp->sd_logd_process = p; 924 sdp->sd_logd_process = p;
925 925
926 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 926 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
927 error = IS_ERR(p); 927 if (IS_ERR(p)) {
928 if (error) { 928 error = PTR_ERR(p);
929 fs_err(sdp, "can't start quotad thread: %d\n", error); 929 fs_err(sdp, "can't start quotad thread: %d\n", error);
930 goto fail; 930 goto fail;
931 } 931 }
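The two hunks above fix a classic ERR_PTR slip: kthread_run() never returns NULL, so IS_ERR(p) evaluates to 0 or 1, and the old code stored that boolean as the error, reporting errno 1 instead of the real failure. The correct idiom:

	struct task_struct *p;

	p = kthread_run(threadfn, data, "examplefs_worker");
	if (IS_ERR(p))
		return PTR_ERR(p);	/* decode the real negative errno */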
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c253b13722e8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
1154 return error; 1154 return error;
1155} 1155}
1156 1156
1157static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1158{
1159 return gfs2_quota_sync(sb, type);
1160}
1161
1162int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) 1157int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1163{ 1158{
1164 struct gfs2_quota_data *qd; 1159 struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
1414 &tune->gt_statfs_quantum); 1409 &tune->gt_statfs_quantum);
1415 1410
1416 /* Update quota file */ 1411 /* Update quota file */
1417 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, 1412 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1418 &quotad_timeo, &tune->gt_quota_quantum); 1413 &quotad_timeo, &tune->gt_quota_quantum);
1419 1414
1420 /* Check for & recover partially truncated inodes */ 1415 /* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9809156e3d04..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1288,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1288 minlen = max_t(u64, r.minlen, 1288 minlen = max_t(u64, r.minlen,
1289 q->limits.discard_granularity) >> bs_shift; 1289 q->limits.discard_granularity) >> bs_shift;
1290 1290
1291 if (end <= start || minlen > sdp->sd_max_rg_data)
1292 return -EINVAL;
1293
1291 rgd = gfs2_blk2rgrpd(sdp, start, 0); 1294 rgd = gfs2_blk2rgrpd(sdp, start, 0);
1292 rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); 1295 rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1293 1296
1294 if (end <= start || 1297 if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1295 minlen > sdp->sd_max_rg_data || 1298 && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1296 start > rgd_end->rd_data0 + rgd_end->rd_data) 1299 return -EINVAL; /* start is beyond the end of the fs */
1297 return -EINVAL;
1298 1300
1299 while (1) { 1301 while (1) {
1300 1302
@@ -1336,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1336 } 1338 }
1337 1339
1338out: 1340out:
1339 r.len = trimmed << 9; 1341 r.len = trimmed << bs_shift;
1340 if (copy_to_user(argp, &r, sizeof(r))) 1342 if (copy_to_user(argp, &r, sizeof(r)))
1341 return -EFAULT; 1343 return -EFAULT;
1342 1344
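Two fixes land in this fitrim hunk. First, the range sanity checks are hoisted ahead of the rgrp lookups, so a bogus range can no longer be dereferenced through a possibly invalid rgd_end. Second, the reported length gets the right units: trimmed is counted in filesystem blocks, so on a 4 KiB-block filesystem the old "trimmed << 9" under-reported by a factor of 2^(12-9) = 8, while "trimmed << bs_shift" (here a shift of 12) returns the byte count FITRIM expects.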
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
270 270
271void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) 271void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
272{ 272{
273 struct gfs2_glock *gl = bd->bd_gl;
274 struct gfs2_trans *tr = current->journal_info; 273 struct gfs2_trans *tr = current->journal_info;
275 274
276 BUG_ON(!list_empty(&bd->bd_list)); 275 BUG_ON(!list_empty(&bd->bd_list));
277 BUG_ON(!list_empty(&bd->bd_ail_st_list)); 276 gfs2_add_revoke(sdp, bd);
278 BUG_ON(!list_empty(&bd->bd_ail_gl_list));
279 bd->bd_ops = &gfs2_revoke_lops;
280 tr->tr_touched = 1; 277 tr->tr_touched = 1;
281 tr->tr_num_revoke++; 278 tr->tr_num_revoke++;
282 sdp->sd_log_num_revoke++;
283 atomic_inc(&gl->gl_revokes);
284 set_bit(GLF_LFLUSH, &gl->gl_flags);
285 list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
286} 279}
287 280
288void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) 281void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e0101b6fb0d7..145566851e7a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -51,9 +51,9 @@ done:
51/* 51/*
52 * hfs_readdir 52 * hfs_readdir
53 */ 53 */
54static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 54static int hfs_readdir(struct file *file, struct dir_context *ctx)
55{ 55{
56 struct inode *inode = file_inode(filp); 56 struct inode *inode = file_inode(file);
57 struct super_block *sb = inode->i_sb; 57 struct super_block *sb = inode->i_sb;
58 int len, err; 58 int len, err;
59 char strbuf[HFS_MAX_NAMELEN]; 59 char strbuf[HFS_MAX_NAMELEN];
@@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
62 struct hfs_readdir_data *rd; 62 struct hfs_readdir_data *rd;
63 u16 type; 63 u16 type;
64 64
65 if (filp->f_pos >= inode->i_size) 65 if (ctx->pos >= inode->i_size)
66 return 0; 66 return 0;
67 67
68 err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); 68 err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
@@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
73 if (err) 73 if (err)
74 goto out; 74 goto out;
75 75
76 switch ((u32)filp->f_pos) { 76 if (ctx->pos == 0) {
77 case 0:
78 /* This is completely artificial... */ 77 /* This is completely artificial... */
79 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 78 if (!dir_emit_dot(file, ctx))
80 goto out; 79 goto out;
81 filp->f_pos++; 80 ctx->pos = 1;
82 /* fall through */ 81 }
83 case 1: 82 if (ctx->pos == 1) {
84 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { 83 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
85 err = -EIO; 84 err = -EIO;
86 goto out; 85 goto out;
@@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
97 // err = -EIO; 96 // err = -EIO;
98 // goto out; 97 // goto out;
99 //} 98 //}
100 if (filldir(dirent, "..", 2, 1, 99 if (!dir_emit(ctx, "..", 2,
101 be32_to_cpu(entry.thread.ParID), DT_DIR)) 100 be32_to_cpu(entry.thread.ParID), DT_DIR))
102 goto out; 101 goto out;
103 filp->f_pos++; 102 ctx->pos = 2;
104 /* fall through */
105 default:
106 if (filp->f_pos >= inode->i_size)
107 goto out;
108 err = hfs_brec_goto(&fd, filp->f_pos - 1);
109 if (err)
110 goto out;
111 } 103 }
104 if (ctx->pos >= inode->i_size)
105 goto out;
106 err = hfs_brec_goto(&fd, ctx->pos - 1);
107 if (err)
108 goto out;
112 109
113 for (;;) { 110 for (;;) {
114 if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { 111 if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
@@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
131 err = -EIO; 128 err = -EIO;
132 goto out; 129 goto out;
133 } 130 }
134 if (filldir(dirent, strbuf, len, filp->f_pos, 131 if (!dir_emit(ctx, strbuf, len,
135 be32_to_cpu(entry.dir.DirID), DT_DIR)) 132 be32_to_cpu(entry.dir.DirID), DT_DIR))
136 break; 133 break;
137 } else if (type == HFS_CDR_FIL) { 134 } else if (type == HFS_CDR_FIL) {
@@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
140 err = -EIO; 137 err = -EIO;
141 goto out; 138 goto out;
142 } 139 }
143 if (filldir(dirent, strbuf, len, filp->f_pos, 140 if (!dir_emit(ctx, strbuf, len,
144 be32_to_cpu(entry.file.FlNum), DT_REG)) 141 be32_to_cpu(entry.file.FlNum), DT_REG))
145 break; 142 break;
146 } else { 143 } else {
@@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
148 err = -EIO; 145 err = -EIO;
149 goto out; 146 goto out;
150 } 147 }
151 filp->f_pos++; 148 ctx->pos++;
152 if (filp->f_pos >= inode->i_size) 149 if (ctx->pos >= inode->i_size)
153 goto out; 150 goto out;
154 err = hfs_brec_goto(&fd, 1); 151 err = hfs_brec_goto(&fd, 1);
155 if (err) 152 if (err)
156 goto out; 153 goto out;
157 } 154 }
158 rd = filp->private_data; 155 rd = file->private_data;
159 if (!rd) { 156 if (!rd) {
160 rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); 157 rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
161 if (!rd) { 158 if (!rd) {
162 err = -ENOMEM; 159 err = -ENOMEM;
163 goto out; 160 goto out;
164 } 161 }
165 filp->private_data = rd; 162 file->private_data = rd;
166 rd->file = filp; 163 rd->file = file;
167 list_add(&rd->list, &HFS_I(inode)->open_dir_list); 164 list_add(&rd->list, &HFS_I(inode)->open_dir_list);
168 } 165 }
169 memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); 166 memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 303
307const struct file_operations hfs_dir_operations = { 304const struct file_operations hfs_dir_operations = {
308 .read = generic_read_dir, 305 .read = generic_read_dir,
309 .readdir = hfs_readdir, 306 .iterate = hfs_readdir,
310 .llseek = generic_file_llseek, 307 .llseek = generic_file_llseek,
311 .release = hfs_dir_release, 308 .release = hfs_dir_release,
312}; 309};
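This is the first of several readdir conversions in this merge (hfsplus, hostfs and hpfs below follow the same recipe): ->readdir(file, dirent, filldir) becomes ->iterate(file, ctx), position state moves from file->f_pos to ctx->pos, and dir_emit() returns false when the user buffer is full instead of filldir's negative return. A minimal sketch of the new shape, with the examplefs_* helpers hypothetical:

	static int examplefs_iterate(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		if (!dir_emit_dots(file, ctx))	/* "." and ".." as needed */
			return 0;

		while (ctx->pos < examplefs_nr_entries(inode)) {
			const struct examplefs_dirent *de =
				examplefs_entry(inode, ctx->pos);

			if (!dir_emit(ctx, de->name, de->namelen,
				      de->ino, DT_UNKNOWN))
				return 0;	/* buffer full; resume later */
			ctx->pos++;
		}
		return 0;
	}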
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a73b11839a41..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
229/* string.c */ 229/* string.c */
230extern const struct dentry_operations hfs_dentry_operations; 230extern const struct dentry_operations hfs_dentry_operations;
231 231
232extern int hfs_hash_dentry(const struct dentry *, const struct inode *, 232extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
233 struct qstr *);
234extern int hfs_strcmp(const unsigned char *, unsigned int, 233extern int hfs_strcmp(const unsigned char *, unsigned int,
235 const unsigned char *, unsigned int); 234 const unsigned char *, unsigned int);
236extern int hfs_compare_dentry(const struct dentry *parent, 235extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
237 const struct inode *pinode,
238 const struct dentry *dentry, const struct inode *inode,
239 unsigned int len, const char *str, const struct qstr *name); 236 unsigned int len, const char *str, const struct qstr *name);
240 237
241/* trans.c */ 238/* trans.c */
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
51/* 51/*
52 * Hash a string to an integer in a case-independent way 52 * Hash a string to an integer in a case-independent way
53 */ 53 */
54int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 54int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
55 struct qstr *this)
56{ 55{
57 const unsigned char *name = this->name; 56 const unsigned char *name = this->name;
58 unsigned int hash, len = this->len; 57 unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
93 * Test for equality of two strings in the HFS filename character ordering. 92 * Test for equality of two strings in the HFS filename character ordering.
94 * return 1 on failure and 0 on success 93 * return 1 on failure and 0 on success
95 */ 94 */
96int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, 95int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
97 const struct dentry *dentry, const struct inode *inode,
98 unsigned int len, const char *str, const struct qstr *name) 96 unsigned int len, const char *str, const struct qstr *name)
99{ 97{
100 const unsigned char *n1, *n2; 98 const unsigned char *n1, *n2;
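The prototype churn in hfs_fs.h and string.c (mirrored below for the hfsplus and hpfs dentry ops) tracks a VFS-wide change in this merge: d_hash and d_compare no longer receive inode pointers, only the dentries. The resulting shapes, as implied by the declarations above:

	/* dentry_operations hooks, 3.11-era signatures */
	int (*d_hash)(const struct dentry *dentry, struct qstr *name);
	int (*d_compare)(const struct dentry *parent, const struct dentry *dentry,
			 unsigned int len, const char *str,
			 const struct qstr *name);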
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a37ac934732f..d8ce4bd17fc5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -121,9 +121,9 @@ fail:
121 return ERR_PTR(err); 121 return ERR_PTR(err);
122} 122}
123 123
124static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) 124static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
125{ 125{
126 struct inode *inode = file_inode(filp); 126 struct inode *inode = file_inode(file);
127 struct super_block *sb = inode->i_sb; 127 struct super_block *sb = inode->i_sb;
128 int len, err; 128 int len, err;
129 char strbuf[HFSPLUS_MAX_STRLEN + 1]; 129 char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
132 struct hfsplus_readdir_data *rd; 132 struct hfsplus_readdir_data *rd;
133 u16 type; 133 u16 type;
134 134
135 if (filp->f_pos >= inode->i_size) 135 if (file->f_pos >= inode->i_size)
136 return 0; 136 return 0;
137 137
138 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 138 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
143 if (err) 143 if (err)
144 goto out; 144 goto out;
145 145
146 switch ((u32)filp->f_pos) { 146 if (ctx->pos == 0) {
147 case 0:
148 /* This is completely artificial... */ 147 /* This is completely artificial... */
149 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 148 if (!dir_emit_dot(file, ctx))
150 goto out; 149 goto out;
151 filp->f_pos++; 150 ctx->pos = 1;
152 /* fall through */ 151 }
153 case 1: 152 if (ctx->pos == 1) {
154 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { 153 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
155 err = -EIO; 154 err = -EIO;
156 goto out; 155 goto out;
@@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
168 err = -EIO; 167 err = -EIO;
169 goto out; 168 goto out;
170 } 169 }
171 if (filldir(dirent, "..", 2, 1, 170 if (!dir_emit(ctx, "..", 2,
172 be32_to_cpu(entry.thread.parentID), DT_DIR)) 171 be32_to_cpu(entry.thread.parentID), DT_DIR))
173 goto out; 172 goto out;
174 filp->f_pos++; 173 ctx->pos = 2;
175 /* fall through */
176 default:
177 if (filp->f_pos >= inode->i_size)
178 goto out;
179 err = hfs_brec_goto(&fd, filp->f_pos - 1);
180 if (err)
181 goto out;
182 } 174 }
183 175 if (ctx->pos >= inode->i_size)
176 goto out;
177 err = hfs_brec_goto(&fd, ctx->pos - 1);
178 if (err)
179 goto out;
184 for (;;) { 180 for (;;) {
185 if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { 181 if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
186 pr_err("walked past end of dir\n"); 182 pr_err("walked past end of dir\n");
@@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
211 HFSPLUS_SB(sb)->hidden_dir->i_ino == 207 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
212 be32_to_cpu(entry.folder.id)) 208 be32_to_cpu(entry.folder.id))
213 goto next; 209 goto next;
214 if (filldir(dirent, strbuf, len, filp->f_pos, 210 if (!dir_emit(ctx, strbuf, len,
215 be32_to_cpu(entry.folder.id), DT_DIR)) 211 be32_to_cpu(entry.folder.id), DT_DIR))
216 break; 212 break;
217 } else if (type == HFSPLUS_FILE) { 213 } else if (type == HFSPLUS_FILE) {
@@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
220 err = -EIO; 216 err = -EIO;
221 goto out; 217 goto out;
222 } 218 }
223 if (filldir(dirent, strbuf, len, filp->f_pos, 219 if (!dir_emit(ctx, strbuf, len,
224 be32_to_cpu(entry.file.id), DT_REG)) 220 be32_to_cpu(entry.file.id), DT_REG))
225 break; 221 break;
226 } else { 222 } else {
@@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 goto out; 225 goto out;
230 } 226 }
231next: 227next:
232 filp->f_pos++; 228 ctx->pos++;
233 if (filp->f_pos >= inode->i_size) 229 if (ctx->pos >= inode->i_size)
234 goto out; 230 goto out;
235 err = hfs_brec_goto(&fd, 1); 231 err = hfs_brec_goto(&fd, 1);
236 if (err) 232 if (err)
237 goto out; 233 goto out;
238 } 234 }
239 rd = filp->private_data; 235 rd = file->private_data;
240 if (!rd) { 236 if (!rd) {
241 rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); 237 rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
242 if (!rd) { 238 if (!rd) {
243 err = -ENOMEM; 239 err = -ENOMEM;
244 goto out; 240 goto out;
245 } 241 }
246 filp->private_data = rd; 242 file->private_data = rd;
247 rd->file = filp; 243 rd->file = file;
248 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); 244 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
249 } 245 }
250 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 246 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
538const struct file_operations hfsplus_dir_operations = { 534const struct file_operations hfsplus_dir_operations = {
539 .fsync = hfsplus_file_fsync, 535 .fsync = hfsplus_file_fsync,
540 .read = generic_read_dir, 536 .read = generic_read_dir,
541 .readdir = hfsplus_readdir, 537 .iterate = hfsplus_readdir,
542 .unlocked_ioctl = hfsplus_ioctl, 538 .unlocked_ioctl = hfsplus_ioctl,
543 .llseek = generic_file_llseek, 539 .llseek = generic_file_llseek,
544 .release = hfsplus_dir_release, 540 .release = hfsplus_dir_release,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 60b0a3388b26..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
495 const struct hfsplus_unistr *, char *, int *); 495 const struct hfsplus_unistr *, char *, int *);
496int hfsplus_asc2uni(struct super_block *, 496int hfsplus_asc2uni(struct super_block *,
497 struct hfsplus_unistr *, int, const char *, int); 497 struct hfsplus_unistr *, int, const char *, int);
498int hfsplus_hash_dentry(const struct dentry *dentry, 498int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
499 const struct inode *inode, struct qstr *str); 499int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
500int hfsplus_compare_dentry(const struct dentry *parent,
501 const struct inode *pinode,
502 const struct dentry *dentry, const struct inode *inode,
503 unsigned int len, const char *str, const struct qstr *name); 500 unsigned int len, const char *str, const struct qstr *name);
504 501
505/* wrapper.c */ 502/* wrapper.c */
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
334 * Composed unicode characters are decomposed and case-folding is performed 334 * Composed unicode characters are decomposed and case-folding is performed
335 * if the appropriate bits are (un)set on the superblock. 335 * if the appropriate bits are (un)set on the superblock.
336 */ 336 */
337int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, 337int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
338 struct qstr *str)
339{ 338{
340 struct super_block *sb = dentry->d_sb; 339 struct super_block *sb = dentry->d_sb;
341 const char *astr; 340 const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
386 * Composed unicode characters are decomposed and case-folding is performed 385 * Composed unicode characters are decomposed and case-folding is performed
387 * if the appropriate bits are (un)set on the superblock. 386 * if the appropriate bits are (un)set on the superblock.
388 */ 387 */
389int hfsplus_compare_dentry(const struct dentry *parent, 388int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
390 const struct inode *pinode,
391 const struct dentry *dentry, const struct inode *inode,
392 unsigned int len, const char *str, const struct qstr *name) 389 unsigned int len, const char *str, const struct qstr *name)
393{ 390{
394 struct super_block *sb = parent->d_sb; 391 struct super_block *sb = parent->d_sb;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 32f35f187989..cddb05217512 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
277 .show_options = hostfs_show_options, 277 .show_options = hostfs_show_options,
278}; 278};
279 279
280int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) 280int hostfs_readdir(struct file *file, struct dir_context *ctx)
281{ 281{
282 void *dir; 282 void *dir;
283 char *name; 283 char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
292 __putname(name); 292 __putname(name);
293 if (dir == NULL) 293 if (dir == NULL)
294 return -error; 294 return -error;
295 next = file->f_pos; 295 next = ctx->pos;
296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { 296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
297 error = (*filldir)(ent, name, len, file->f_pos, 297 if (!dir_emit(ctx, name, len, ino, type))
298 ino, type); 298 break;
299 if (error) break; 299 ctx->pos = next;
300 file->f_pos = next;
301 } 300 }
302 close_dir(dir); 301 close_dir(dir);
303 return 0; 302 return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
393 392
394static const struct file_operations hostfs_dir_fops = { 393static const struct file_operations hostfs_dir_fops = {
395 .llseek = generic_file_llseek, 394 .llseek = generic_file_llseek,
396 .readdir = hostfs_readdir, 395 .iterate = hostfs_readdir,
397 .read = generic_read_dir, 396 .read = generic_read_dir,
398}; 397};
399 398
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2e..4d0a1afa058c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/blkdev.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
13void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
14{
15 struct buffer_head *bh;
16 struct blk_plug plug;
17
18 if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
19 return;
20
21 bh = sb_find_get_block(s, secno);
22 if (bh) {
23 if (buffer_uptodate(bh)) {
24 brelse(bh);
25 return;
26 }
27 brelse(bh);
28 };
29
30 blk_start_plug(&plug);
31 while (n > 0) {
32 if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
33 break;
34 sb_breadahead(s, secno);
35 secno++;
36 n--;
37 }
38 blk_finish_plug(&plug);
39}
40
12/* Map a sector into a buffer and return pointers to it and to the buffer. */ 41/* Map a sector into a buffer and return pointers to it and to the buffer. */
13 42
14void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, 43void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
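hpfs_prefetch_sectors() above is the standard plugged-readahead pattern: queue a batch of asynchronous reads inside a blk_plug so the block layer can merge them into larger requests before they are dispatched. The core of the pattern, as a sketch:

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < n; i++)
		sb_breadahead(sb, secno + i);	/* async; does not wait for I/O */
	blk_finish_plug(&plug);		/* unplug: submit the merged batch */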
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
18 47
19 hpfs_lock_assert(s); 48 hpfs_lock_assert(s);
20 49
50 hpfs_prefetch_sectors(s, secno, ahead);
51
21 cond_resched(); 52 cond_resched();
22 53
23 *bhp = bh = sb_bread(s, secno); 54 *bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
67 return NULL; 98 return NULL;
68 } 99 }
69 100
101 hpfs_prefetch_sectors(s, secno, 4 + ahead);
102
70 qbh->data = data = kmalloc(2048, GFP_NOFS); 103 qbh->data = data = kmalloc(2048, GFP_NOFS);
71 if (!data) { 104 if (!data) {
72 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 105 printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 15static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
16 struct qstr *qstr)
17{ 16{
18 unsigned long hash; 17 unsigned long hash;
19 int i; 18 int i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
35 return 0; 34 return 0;
36} 35}
37 36
38static int hpfs_compare_dentry(const struct dentry *parent, 37static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name) 38 unsigned int len, const char *str, const struct qstr *name)
42{ 39{
43 unsigned al = len; 40 unsigned al = len;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 834ac13c04b7..292b1acb9b81 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -57,14 +57,14 @@ fail:
57 return -ESPIPE; 57 return -ESPIPE;
58} 58}
59 59
60static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 60static int hpfs_readdir(struct file *file, struct dir_context *ctx)
61{ 61{
62 struct inode *inode = file_inode(filp); 62 struct inode *inode = file_inode(file);
63 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 63 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
64 struct quad_buffer_head qbh; 64 struct quad_buffer_head qbh;
65 struct hpfs_dirent *de; 65 struct hpfs_dirent *de;
66 int lc; 66 int lc;
67 long old_pos; 67 loff_t next_pos;
68 unsigned char *tempname; 68 unsigned char *tempname;
69 int c1, c2 = 0; 69 int c1, c2 = 0;
70 int ret = 0; 70 int ret = 0;
@@ -105,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
105 } 105 }
106 } 106 }
107 lc = hpfs_sb(inode->i_sb)->sb_lowercase; 107 lc = hpfs_sb(inode->i_sb)->sb_lowercase;
108 if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ 108 if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
109 filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ 109 ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
110 goto out; 110 goto out;
111 } 111 }
112 if (filp->f_pos == 13) { 112 if (ctx->pos == 13) {
113 ret = -ENOENT; 113 ret = -ENOENT;
114 goto out; 114 goto out;
115 } 115 }
@@ -120,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
120 accepted by filldir, but what can I do? 120 accepted by filldir, but what can I do?
121 maybe killall -9 ls helps */ 121 maybe killall -9 ls helps */
122 if (hpfs_sb(inode->i_sb)->sb_chk) 122 if (hpfs_sb(inode->i_sb)->sb_chk)
123 if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { 123 if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
124 ret = -EFSERROR; 124 ret = -EFSERROR;
125 goto out; 125 goto out;
126 } 126 }
127 if (filp->f_pos == 12) 127 if (ctx->pos == 12)
128 goto out; 128 goto out;
129 if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { 129 if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
130 printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); 130 printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
131 goto out; 131 goto out;
132 } 132 }
133 if (filp->f_pos == 0) { 133 if (ctx->pos == 0) {
134 if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) 134 if (!dir_emit_dot(file, ctx))
135 goto out; 135 goto out;
136 filp->f_pos = 11; 136 ctx->pos = 11;
137 } 137 }
138 if (filp->f_pos == 11) { 138 if (ctx->pos == 11) {
139 if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) 139 if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
140 goto out; 140 goto out;
141 filp->f_pos = 1; 141 ctx->pos = 1;
142 } 142 }
143 if (filp->f_pos == 1) { 143 if (ctx->pos == 1) {
144 filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; 144 ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
145 hpfs_add_pos(inode, &filp->f_pos); 145 hpfs_add_pos(inode, &file->f_pos);
146 filp->f_version = inode->i_version; 146 file->f_version = inode->i_version;
147 } 147 }
148 old_pos = filp->f_pos; 148 next_pos = ctx->pos;
149 if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { 149 if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
150 ctx->pos = next_pos;
150 ret = -EIOERROR; 151 ret = -EIOERROR;
151 goto out; 152 goto out;
152 } 153 }
@@ -154,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
154 if (hpfs_sb(inode->i_sb)->sb_chk) { 155 if (hpfs_sb(inode->i_sb)->sb_chk) {
155 if (de->first && !de->last && (de->namelen != 2 156 if (de->first && !de->last && (de->namelen != 2
156 || de ->name[0] != 1 || de->name[1] != 1)) 157 || de ->name[0] != 1 || de->name[1] != 1))
157 hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); 158 hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
158 if (de->last && (de->namelen != 1 || de ->name[0] != 255)) 159 if (de->last && (de->namelen != 1 || de ->name[0] != 255))
159 hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); 160 hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
160 } 161 }
161 hpfs_brelse4(&qbh); 162 hpfs_brelse4(&qbh);
163 ctx->pos = next_pos;
162 goto again; 164 goto again;
163 } 165 }
164 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 166 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
165 if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { 167 if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
166 filp->f_pos = old_pos;
167 if (tempname != de->name) kfree(tempname); 168 if (tempname != de->name) kfree(tempname);
168 hpfs_brelse4(&qbh); 169 hpfs_brelse4(&qbh);
169 goto out; 170 goto out;
170 } 171 }
172 ctx->pos = next_pos;
171 if (tempname != de->name) kfree(tempname); 173 if (tempname != de->name) kfree(tempname);
172 hpfs_brelse4(&qbh); 174 hpfs_brelse4(&qbh);
173 } 175 }
@@ -322,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
322{ 324{
323 .llseek = hpfs_dir_lseek, 325 .llseek = hpfs_dir_lseek,
324 .read = generic_read_dir, 326 .read = generic_read_dir,
325 .readdir = hpfs_readdir, 327 .iterate = hpfs_readdir,
326 .release = hpfs_dir_release, 328 .release = hpfs_dir_release,
327 .fsync = hpfs_file_fsync, 329 .fsync = hpfs_file_fsync,
328}; 330};
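One detail worth noting in the hpfs conversion above: the new code computes next_pos first, emits, and only publishes it to ctx->pos afterwards, so when the getdents buffer fills mid-directory the next call resumes exactly at the entry that was not emitted. The pattern in isolation (names as in the hunk above):

	next_pos = ctx->pos;
	de = map_pos_dirent(inode, &next_pos, &qbh);	/* advances next_pos */
	...
	if (!dir_emit(ctx, tempname, de->namelen, ino, DT_UNKNOWN))
		goto out;	/* ctx->pos still names this entry */
	ctx->pos = next_pos;	/* committed only after a successful emit */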
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index e4ba5fe4c3b5..4e9dabcf1f4c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10#include <linux/mpage.h>
10 11
11#define BLOCKS(size) (((size) + 511) >> 9) 12#define BLOCKS(size) (((size) + 511) >> 9)
12 13
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
34 * so we must ignore such errors. 35 * so we must ignore such errors.
35 */ 36 */
36 37
37static secno hpfs_bmap(struct inode *inode, unsigned file_secno) 38static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
38{ 39{
39 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 40 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
40 unsigned n, disk_secno; 41 unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
42 struct buffer_head *bh; 43 struct buffer_head *bh;
43 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; 44 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
44 n = file_secno - hpfs_inode->i_file_sec; 45 n = file_secno - hpfs_inode->i_file_sec;
45 if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; 46 if (n < hpfs_inode->i_n_secs) {
47 *n_secs = hpfs_inode->i_n_secs - n;
48 return hpfs_inode->i_disk_sec + n;
49 }
46 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; 50 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
47 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); 51 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
48 if (disk_secno == -1) return 0; 52 if (disk_secno == -1) return 0;
49 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; 53 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
54 n = file_secno - hpfs_inode->i_file_sec;
55 if (n < hpfs_inode->i_n_secs) {
56 *n_secs = hpfs_inode->i_n_secs - n;
57 return hpfs_inode->i_disk_sec + n;
58 }
59 *n_secs = 1;
50 return disk_secno; 60 return disk_secno;
51} 61}
52 62
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
67{ 77{
68 int r; 78 int r;
69 secno s; 79 secno s;
80 unsigned n_secs;
70 hpfs_lock(inode->i_sb); 81 hpfs_lock(inode->i_sb);
71 s = hpfs_bmap(inode, iblock); 82 s = hpfs_bmap(inode, iblock, &n_secs);
72 if (s) { 83 if (s) {
84 if (bh_result->b_size >> 9 < n_secs)
85 n_secs = bh_result->b_size >> 9;
73 map_bh(bh_result, inode->i_sb, s); 86 map_bh(bh_result, inode->i_sb, s);
87 bh_result->b_size = n_secs << 9;
74 goto ret_0; 88 goto ret_0;
75 } 89 }
76 if (!create) goto ret_0; 90 if (!create) goto ret_0;
@@ -95,14 +109,26 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
95 return r; 109 return r;
96} 110}
97 111
112static int hpfs_readpage(struct file *file, struct page *page)
113{
114 return mpage_readpage(page, hpfs_get_block);
115}
116
98static int hpfs_writepage(struct page *page, struct writeback_control *wbc) 117static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
99{ 118{
100 return block_write_full_page(page,hpfs_get_block, wbc); 119 return block_write_full_page(page, hpfs_get_block, wbc);
101} 120}
102 121
103static int hpfs_readpage(struct file *file, struct page *page) 122static int hpfs_readpages(struct file *file, struct address_space *mapping,
123 struct list_head *pages, unsigned nr_pages)
124{
125 return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
126}
127
128static int hpfs_writepages(struct address_space *mapping,
129 struct writeback_control *wbc)
104{ 130{
105 return block_read_full_page(page,hpfs_get_block); 131 return mpage_writepages(mapping, wbc, hpfs_get_block);
106} 132}
107 133
108static void hpfs_write_failed(struct address_space *mapping, loff_t to) 134static void hpfs_write_failed(struct address_space *mapping, loff_t to)
@@ -161,6 +187,8 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
161const struct address_space_operations hpfs_aops = { 187const struct address_space_operations hpfs_aops = {
162 .readpage = hpfs_readpage, 188 .readpage = hpfs_readpage,
163 .writepage = hpfs_writepage, 189 .writepage = hpfs_writepage,
190 .readpages = hpfs_readpages,
191 .writepages = hpfs_writepages,
164 .write_begin = hpfs_write_begin, 192 .write_begin = hpfs_write_begin,
165 .write_end = hpfs_write_end, 193 .write_end = hpfs_write_end,
166 .bmap = _hpfs_bmap 194 .bmap = _hpfs_bmap
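
The hpfs_bmap()/hpfs_get_block() hunks above carry the substance of this change: hpfs_bmap() now reports through *n_secs how many sectors are known to be contiguous, and hpfs_get_block() clamps that count to the size the caller asked for before widening bh_result->b_size, which is what lets the mpage readpages/writepages paths wired up here build multi-sector BIOs instead of one request per block. A minimal sketch of the same pattern outside hpfs (example_get_block and example_extent_lookup are made-up names, not kernel API):

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /* Hypothetical helper: physical sector backing @iblock, length of the
     * contiguous run in *n_secs; returns 0 for a hole. */
    static sector_t example_extent_lookup(struct inode *inode, sector_t iblock,
                                          unsigned *n_secs);

    static int example_get_block(struct inode *inode, sector_t iblock,
                                 struct buffer_head *bh_result, int create)
    {
            unsigned n_secs;
            sector_t phys = example_extent_lookup(inode, iblock, &n_secs);

            if (!phys)
                    return 0;       /* hole; block allocation elided in this sketch */
            if (n_secs > bh_result->b_size >> 9)
                    n_secs = bh_result->b_size >> 9;  /* never map past the request */
            map_bh(bh_result, inode->i_sb, phys);
            bh_result->b_size = (size_t)n_secs << 9;  /* advertise the whole run */
            return 0;
    }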
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b5..1b398636e990 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
27#define ALLOC_FWD_MAX 128 27#define ALLOC_FWD_MAX 128
28#define ALLOC_M 1 28#define ALLOC_M 1
29#define FNODE_RD_AHEAD 16 29#define FNODE_RD_AHEAD 16
30#define ANODE_RD_AHEAD 16 30#define ANODE_RD_AHEAD 0
31#define DNODE_RD_AHEAD 4 31#define DNODE_RD_AHEAD 72
32#define COUNT_RD_AHEAD 62
32 33
33#define FREE_DNODES_ADD 58 34#define FREE_DNODES_ADD 58
34#define FREE_DNODES_DEL 29 35#define FREE_DNODES_DEL 29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
207 208
208/* buffer.c */ 209/* buffer.c */
209 210
211void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
210void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); 212void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
211void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); 213void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
212void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); 214void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
271 273
272__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 274__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
273__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 275__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
276void hpfs_prefetch_bitmap(struct super_block *, unsigned);
274unsigned char *hpfs_load_code_page(struct super_block *, secno); 277unsigned char *hpfs_load_code_page(struct super_block *, secno);
275__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 278__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
276struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 279struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d78359..3aa66ae1031e 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
17 struct quad_buffer_head *qbh, char *id) 17 struct quad_buffer_head *qbh, char *id)
18{ 18{
19 secno sec; 19 secno sec;
20 if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { 20 __le32 *ret;
21 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
22 if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
21 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); 23 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
22 return NULL; 24 return NULL;
23 } 25 }
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
26 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); 28 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
27 return NULL; 29 return NULL;
28 } 30 }
29 return hpfs_map_4sectors(s, sec, qbh, 4); 31 ret = hpfs_map_4sectors(s, sec, qbh, 4);
32 if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
33 return ret;
34}
35
36void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
37{
38 unsigned to_prefetch, next_prefetch;
39 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
40 if (unlikely(bmp_block >= n_bands))
41 return;
42 to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
43 if (unlikely(bmp_block + 1 >= n_bands))
44 next_prefetch = 0;
45 else
46 next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
47 hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
30} 48}
31 49
32/* 50/*
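
The arithmetic in hpfs_prefetch_bitmap() is worth spelling out: one HPFS allocation bitmap is 4 sectors of 512 bytes, i.e. 2048 bytes or 16384 bits, one bit per sector, so each bitmap governs a 16384-sector band, and (sb_fs_size + 0x3fff) >> 14 is a round-up division of the volume size by 16384. The prefetch then doubles from 4 to 8 sectors when the next band's bitmap happens to sit directly behind the current one on disk (to_prefetch + 4 == next_prefetch). A sketch of the band computation (example_n_bands is a made-up name):

    /* Round-up division by 16384, the sectors covered per bitmap band. */
    static inline unsigned example_n_bands(unsigned fs_size_in_sectors)
    {
            return (fs_size_in_sectors + 0x3fff) >> 14;
    }
    /* A 1 GiB volume has 2097152 sectors, so example_n_bands() == 128. */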
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e706957..4334cda8dba1 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
121 unsigned long *bits; 121 unsigned long *bits;
122 unsigned count; 122 unsigned count;
123 123
124 bits = hpfs_map_4sectors(s, secno, &qbh, 4); 124 bits = hpfs_map_4sectors(s, secno, &qbh, 0);
125 if (!bits) 125 if (!bits)
126 return 0; 126 return 0;
127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); 127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
134 unsigned n, count, n_bands; 134 unsigned n, count, n_bands;
135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; 135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
136 count = 0; 136 count = 0;
137 for (n = 0; n < n_bands; n++) 137 for (n = 0; n < COUNT_RD_AHEAD; n++) {
138 hpfs_prefetch_bitmap(s, n);
139 }
140 for (n = 0; n < n_bands; n++) {
141 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
138 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); 142 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
143 }
139 return count; 144 return count;
140} 145}
141 146
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
558 sbi->sb_cp_table = NULL; 563 sbi->sb_cp_table = NULL;
559 sbi->sb_c_bitmap = -1; 564 sbi->sb_c_bitmap = -1;
560 sbi->sb_max_fwd_alloc = 0xffffff; 565 sbi->sb_max_fwd_alloc = 0xffffff;
561 566
567 if (sbi->sb_fs_size >= 0x80000000) {
568 hpfs_error(s, "invalid size in superblock: %08x",
569 (unsigned)sbi->sb_fs_size);
570 goto bail4;
571 }
572
562 /* Load bitmap directory */ 573 /* Load bitmap directory */
563 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) 574 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
564 goto bail4; 575 goto bail4;
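
The new size check in hpfs_fill_super() and the reworked guard in hpfs_map_bitmap() fit together: with sb_fs_size capped below 0x80000000, n_bands never exceeds 0x20000, and comparing band indices (bmp_block >= n_bands) sidesteps the 32-bit multiply in the old test, which could wrap; that is presumably why the index comparison replaced it. An illustrative case of the wrap the old guard allowed:

    /* Old guard: bmp_block * 16384 > sb_fs_size, all 32-bit unsigned.
     * With bmp_block = 0x40001 the product is 0x100004000, which wraps
     * to 0x4000; for sb_fs_size = 0x5000 the comparison 0x4000 > 0x5000
     * is false, so a wildly out-of-range bmp_block slipped through.
     * bmp_block >= n_bands involves no multiply and cannot wrap. */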
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index cd3e38972c86..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
69 struct dentry *parent; 69 struct dentry *parent;
70 char *root, *name; 70 char *root, *name;
71 const char *seg_name; 71 const char *seg_name;
72 int len, seg_len; 72 int len, seg_len, root_len;
73 73
74 len = 0; 74 len = 0;
75 parent = dentry; 75 parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
81 } 81 }
82 82
83 root = "proc"; 83 root = "proc";
84 len += strlen(root); 84 root_len = strlen(root);
85 len += root_len;
85 name = kmalloc(len + extra + 1, GFP_KERNEL); 86 name = kmalloc(len + extra + 1, GFP_KERNEL);
86 if (name == NULL) 87 if (name == NULL)
87 return NULL; 88 return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
91 while (parent->d_parent != parent) { 92 while (parent->d_parent != parent) {
92 if (is_pid(parent)) { 93 if (is_pid(parent)) {
93 seg_name = "pid"; 94 seg_name = "pid";
94 seg_len = strlen("pid"); 95 seg_len = strlen(seg_name);
95 } 96 }
96 else { 97 else {
97 seg_name = parent->d_name.name; 98 seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
100 101
101 len -= seg_len + 1; 102 len -= seg_len + 1;
102 name[len] = '/'; 103 name[len] = '/';
103 strncpy(&name[len + 1], seg_name, seg_len); 104 memcpy(&name[len + 1], seg_name, seg_len);
104 parent = parent->d_parent; 105 parent = parent->d_parent;
105 } 106 }
106 strncpy(name, root, strlen(root)); 107 memcpy(name, root, root_len);
107 return name; 108 return name;
108} 109}
109 110
@@ -542,8 +543,8 @@ static const struct file_operations hppfs_file_fops = {
542}; 543};
543 544
544struct hppfs_dirent { 545struct hppfs_dirent {
545 void *vfs_dirent; 546 struct dir_context ctx;
546 filldir_t filldir; 547 struct dir_context *caller;
547 struct dentry *dentry; 548 struct dentry *dentry;
548}; 549};
549 550
@@ -555,34 +556,29 @@ static int hppfs_filldir(void *d, const char *name, int size,
555 if (file_removed(dirent->dentry, name)) 556 if (file_removed(dirent->dentry, name))
556 return 0; 557 return 0;
557 558
558 return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, 559 dirent->caller->pos = dirent->ctx.pos;
559 inode, type); 560 return !dir_emit(dirent->caller, name, size, inode, type);
560} 561}
561 562
562static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) 563static int hppfs_readdir(struct file *file, struct dir_context *ctx)
563{ 564{
564 struct hppfs_private *data = file->private_data; 565 struct hppfs_private *data = file->private_data;
565 struct file *proc_file = data->proc_file; 566 struct file *proc_file = data->proc_file;
566 int (*readdir)(struct file *, void *, filldir_t); 567 struct hppfs_dirent d = {
567 struct hppfs_dirent dirent = ((struct hppfs_dirent) 568 .ctx.actor = hppfs_filldir,
568 { .vfs_dirent = ent, 569 .caller = ctx,
569 .filldir = filldir, 570 .dentry = file->f_path.dentry
570 .dentry = file->f_path.dentry 571 };
571 });
572 int err; 572 int err;
573 573 proc_file->f_pos = ctx->pos;
574 readdir = file_inode(proc_file)->i_fop->readdir; 574 err = iterate_dir(proc_file, &d.ctx);
575 575 ctx->pos = d.ctx.pos;
576 proc_file->f_pos = file->f_pos;
577 err = (*readdir)(proc_file, &dirent, hppfs_filldir);
578 file->f_pos = proc_file->f_pos;
579
580 return err; 576 return err;
581} 577}
582 578
583static const struct file_operations hppfs_dir_fops = { 579static const struct file_operations hppfs_dir_fops = {
584 .owner = NULL, 580 .owner = NULL,
585 .readdir = hppfs_readdir, 581 .iterate = hppfs_readdir,
586 .open = hppfs_dir_open, 582 .open = hppfs_dir_open,
587 .llseek = default_llseek, 583 .llseek = default_llseek,
588 .release = hppfs_release, 584 .release = hppfs_release,
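
The hppfs conversion shows the standard way to stack one directory iterator on another under the new dir_context API: embed a struct dir_context whose actor filters entries, keep a pointer to the caller's context, and forward surviving entries with dir_emit(); a nonzero return from the actor stops iterate_dir(). A generic sketch of the same shape (the filter_* names are hypothetical; the actor signature matches the filldir_t of this kernel):

    #include <linux/fs.h>

    struct filter_dirent {
            struct dir_context ctx;   /* must come first: iterate_dir() gets &ctx */
            struct dir_context *caller;
    };

    static int filter_actor(void *d, const char *name, int len,
                            loff_t offset, u64 ino, unsigned int type)
    {
            struct filter_dirent *f = d;

            if (len > 2 && name[0] == '.')
                    return 0;                 /* crude filter: skip, keep iterating */
            f->caller->pos = f->ctx.pos;      /* keep the outer position in sync */
            return !dir_emit(f->caller, name, len, ino, type);
    }

    static int filter_readdir(struct file *backing, struct dir_context *ctx)
    {
            struct filter_dirent f = {
                    .ctx.actor = filter_actor,
                    .caller = ctx,
            };
            int err;

            backing->f_pos = ctx->pos;
            err = iterate_dir(backing, &f.ctx);
            ctx->pos = f.ctx.pos;
            return err;
    }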
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a3f868ae3fd4..34423978b170 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
463 return inode; 463 return inode;
464} 464}
465 465
466/*
467 * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
468 * be taken from reclaim -- unlike regular filesystems. This needs an
469 * annotation because huge_pmd_share() does an allocation under
470 * i_mmap_mutex.
471 */
472struct lock_class_key hugetlbfs_i_mmap_mutex_key;
473
466static struct inode *hugetlbfs_get_inode(struct super_block *sb, 474static struct inode *hugetlbfs_get_inode(struct super_block *sb,
467 struct inode *dir, 475 struct inode *dir,
468 umode_t mode, dev_t dev) 476 umode_t mode, dev_t dev)
@@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
474 struct hugetlbfs_inode_info *info; 482 struct hugetlbfs_inode_info *info;
475 inode->i_ino = get_next_ino(); 483 inode->i_ino = get_next_ino();
476 inode_init_owner(inode, dir, mode); 484 inode_init_owner(inode, dir, mode);
485 lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
486 &hugetlbfs_i_mmap_mutex_key);
477 inode->i_mapping->a_ops = &hugetlbfs_aops; 487 inode->i_mapping->a_ops = &hugetlbfs_aops;
478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
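
The hugetlbfs hunk is a pure lockdep annotation: huge_pmd_share() allocates while holding i_mmap_mutex, which for a regular filesystem would look like a reclaim deadlock, since reclaim can take that mutex. Hugetlbfs pages are never reclaimed, so the pattern is safe, but only if its i_mmap_mutex instances live in their own lock class, separate from the default one assigned at inode setup. Reduced to its essentials (examplefs_* names are hypothetical):

    #include <linux/lockdep.h>

    /* One key per filesystem type puts all of its i_mmap_mutex instances
     * in a dedicated lockdep class. */
    static struct lock_class_key examplefs_i_mmap_mutex_key;

    static void examplefs_mark_mapping(struct inode *inode)
    {
            lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
                              &examplefs_i_mmap_mutex_key);
    }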
diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3b86e1..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
333 */ 333 */
334void inc_nlink(struct inode *inode) 334void inc_nlink(struct inode *inode)
335{ 335{
336 if (WARN_ON(inode->i_nlink == 0)) 336 if (unlikely(inode->i_nlink == 0)) {
337 WARN_ON(!(inode->i_state & I_LINKABLE));
337 atomic_long_dec(&inode->i_sb->s_remove_count); 338 atomic_long_dec(&inode->i_sb->s_remove_count);
339 }
338 340
339 inode->__i_nlink++; 341 inode->__i_nlink++;
340} 342}
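
The inc_nlink() change is the VFS side of O_TMPFILE from this merge window: an inode created without a name legitimately sits at i_nlink == 0 and linkat() may later raise it to 1, so the old unconditional WARN_ON is narrowed to inodes that lack the I_LINKABLE flag. From userspace the legal 0 -> 1 transition looks roughly like this (illustrative, error handling trimmed):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char path[64];
            /* Unnamed file: the inode exists with i_nlink == 0 and is
             * marked I_LINKABLE by the kernel at open time. */
            int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
            if (fd < 0)
                    return 1;
            snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
            /* Give it a name: i_nlink goes 0 -> 1 without tripping the WARN. */
            if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
                       AT_SYMLINK_FOLLOW) < 0)
                    perror("linkat");
            return close(fd);
    }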
diff --git a/fs/internal.h b/fs/internal.h
index 68121584ae37..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,11 +96,12 @@ struct open_flags {
96 umode_t mode; 96 umode_t mode;
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99 int lookup_flags;
99}; 100};
100extern struct file *do_filp_open(int dfd, struct filename *pathname, 101extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int flags); 102 const struct open_flags *op);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 103extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 104 const char *, const struct open_flags *);
104 105
105extern long do_handle_open(int mountdirfd, 106extern long do_handle_open(int mountdirfd,
106 struct file_handle __user *ufh, int open_flag); 107 struct file_handle __user *ufh, int open_flag);
@@ -130,6 +131,7 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
130 * read_write.c 131 * read_write.c
131 */ 132 */
132extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); 133extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
133 135
134/* 136/*
135 * splice.c 137 * splice.c
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c3d4e6..b943cbd963bb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
78/* 78/*
79 * This should _really_ be cleaned up some day.. 79 * This should _really_ be cleaned up some day..
80 */ 80 */
81static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *file,
82 void *dirent, filldir_t filldir, 82 struct dir_context *ctx,
83 char *tmpname, struct iso_directory_record *tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
84{ 84{
85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
94 struct iso_directory_record *de; 94 struct iso_directory_record *de;
95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); 95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
96 96
97 offset = filp->f_pos & (bufsize - 1); 97 offset = ctx->pos & (bufsize - 1);
98 block = filp->f_pos >> bufbits; 98 block = ctx->pos >> bufbits;
99 99
100 while (filp->f_pos < inode->i_size) { 100 while (ctx->pos < inode->i_size) {
101 int de_len; 101 int de_len;
102 102
103 if (!bh) { 103 if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
108 108
109 de = (struct iso_directory_record *) (bh->b_data + offset); 109 de = (struct iso_directory_record *) (bh->b_data + offset);
110 110
111 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *)de;
112 112
113 /* 113 /*
114 * If the length byte is zero, we should move on to the next 114 * If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
119 if (de_len == 0) { 119 if (de_len == 0) {
120 brelse(bh); 120 brelse(bh);
121 bh = NULL; 121 bh = NULL;
122 filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); 122 ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
123 block = filp->f_pos >> bufbits; 123 block = ctx->pos >> bufbits;
124 offset = 0; 124 offset = 0;
125 continue; 125 continue;
126 } 126 }
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
164 164
165 if (de->flags[-sbi->s_high_sierra] & 0x80) { 165 if (de->flags[-sbi->s_high_sierra] & 0x80) {
166 first_de = 0; 166 first_de = 0;
167 filp->f_pos += de_len; 167 ctx->pos += de_len;
168 continue; 168 continue;
169 } 169 }
170 first_de = 1; 170 first_de = 1;
171 171
172 /* Handle the case of the '.' directory */ 172 /* Handle the case of the '.' directory */
173 if (de->name_len[0] == 1 && de->name[0] == 0) { 173 if (de->name_len[0] == 1 && de->name[0] == 0) {
174 if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) 174 if (!dir_emit_dot(file, ctx))
175 break; 175 break;
176 filp->f_pos += de_len; 176 ctx->pos += de_len;
177 continue; 177 continue;
178 } 178 }
179 179
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
181 181
182 /* Handle the case of the '..' directory */ 182 /* Handle the case of the '..' directory */
183 if (de->name_len[0] == 1 && de->name[0] == 1) { 183 if (de->name_len[0] == 1 && de->name[0] == 1) {
184 inode_number = parent_ino(filp->f_path.dentry); 184 if (!dir_emit_dotdot(file, ctx))
185 if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
186 break; 185 break;
187 filp->f_pos += de_len; 186 ctx->pos += de_len;
188 continue; 187 continue;
189 } 188 }
190 189
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
198 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || 197 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
199 (!sbi->s_showassoc && 198 (!sbi->s_showassoc &&
200 (de->flags[-sbi->s_high_sierra] & 4))) { 199 (de->flags[-sbi->s_high_sierra] & 4))) {
201 filp->f_pos += de_len; 200 ctx->pos += de_len;
202 continue; 201 continue;
203 } 202 }
204 203
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
230 } 229 }
231 } 230 }
232 if (len > 0) { 231 if (len > 0) {
233 if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) 232 if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
234 break; 233 break;
235 } 234 }
236 filp->f_pos += de_len; 235 ctx->pos += de_len;
237 236
238 continue; 237 continue;
239 } 238 }
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
247 * handling split directory entries.. The real work is done by 246 * handling split directory entries.. The real work is done by
248 * "do_isofs_readdir()". 247 * "do_isofs_readdir()".
249 */ 248 */
250static int isofs_readdir(struct file *filp, 249static int isofs_readdir(struct file *file, struct dir_context *ctx)
251 void *dirent, filldir_t filldir)
252{ 250{
253 int result; 251 int result;
254 char *tmpname; 252 char *tmpname;
255 struct iso_directory_record *tmpde; 253 struct iso_directory_record *tmpde;
256 struct inode *inode = file_inode(filp); 254 struct inode *inode = file_inode(file);
257 255
258 tmpname = (char *)__get_free_page(GFP_KERNEL); 256 tmpname = (char *)__get_free_page(GFP_KERNEL);
259 if (tmpname == NULL) 257 if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
261 259
262 tmpde = (struct iso_directory_record *) (tmpname+1024); 260 tmpde = (struct iso_directory_record *) (tmpname+1024);
263 261
264 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 262 result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
265 263
266 free_page((unsigned long) tmpname); 264 free_page((unsigned long) tmpname);
267 return result; 265 return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
271{ 269{
272 .llseek = generic_file_llseek, 270 .llseek = generic_file_llseek,
273 .read = generic_read_dir, 271 .read = generic_read_dir,
274 .readdir = isofs_readdir, 272 .iterate = isofs_readdir,
275}; 273};
276 274
277/* 275/*
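
Note how the isofs conversion drops its parent_ino() call: dir_emit_dot() and dir_emit_dotdot() synthesize "." and ".." from the file's own dentry, so filesystems no longer hand-roll those two entries. A typical iterate() prologue using the helpers (examplefs_iterate is a made-up name):

    #include <linux/fs.h>

    static int examplefs_iterate(struct file *file, struct dir_context *ctx)
    {
            /* Emits "." and ".." as needed, advancing ctx->pos to 2;
             * returns false if the caller's buffer filled up. */
            if (!dir_emit_dots(file, ctx))
                    return 0;
            /* real entries follow, starting at ctx->pos == 2 */
            return 0;
    }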
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
28 28
29#define BEQUIET 29#define BEQUIET
30 30
31static int isofs_hashi(const struct dentry *parent, const struct inode *inode, 31static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
32 struct qstr *qstr); 32static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
33static int isofs_hash(const struct dentry *parent, const struct inode *inode,
34 struct qstr *qstr);
35static int isofs_dentry_cmpi(const struct dentry *parent, 33static int isofs_dentry_cmpi(const struct dentry *parent,
36 const struct inode *pinode, 34 const struct dentry *dentry,
37 const struct dentry *dentry, const struct inode *inode,
38 unsigned int len, const char *str, const struct qstr *name); 35 unsigned int len, const char *str, const struct qstr *name);
39static int isofs_dentry_cmp(const struct dentry *parent, 36static int isofs_dentry_cmp(const struct dentry *parent,
40 const struct inode *pinode, 37 const struct dentry *dentry,
41 const struct dentry *dentry, const struct inode *inode,
42 unsigned int len, const char *str, const struct qstr *name); 38 unsigned int len, const char *str, const struct qstr *name);
43 39
44#ifdef CONFIG_JOLIET 40#ifdef CONFIG_JOLIET
45static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, 41static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
46 struct qstr *qstr); 42static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
47static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
48 struct qstr *qstr);
49static int isofs_dentry_cmpi_ms(const struct dentry *parent, 43static int isofs_dentry_cmpi_ms(const struct dentry *parent,
50 const struct inode *pinode, 44 const struct dentry *dentry,
51 const struct dentry *dentry, const struct inode *inode,
52 unsigned int len, const char *str, const struct qstr *name); 45 unsigned int len, const char *str, const struct qstr *name);
53static int isofs_dentry_cmp_ms(const struct dentry *parent, 46static int isofs_dentry_cmp_ms(const struct dentry *parent,
54 const struct inode *pinode, 47 const struct dentry *dentry,
55 const struct dentry *dentry, const struct inode *inode,
56 unsigned int len, const char *str, const struct qstr *name); 48 unsigned int len, const char *str, const struct qstr *name);
57#endif 49#endif
58 50
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
265} 257}
266 258
267static int 259static int
268isofs_hash(const struct dentry *dentry, const struct inode *inode, 260isofs_hash(const struct dentry *dentry, struct qstr *qstr)
269 struct qstr *qstr)
270{ 261{
271 return isofs_hash_common(dentry, qstr, 0); 262 return isofs_hash_common(dentry, qstr, 0);
272} 263}
273 264
274static int 265static int
275isofs_hashi(const struct dentry *dentry, const struct inode *inode, 266isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
276 struct qstr *qstr)
277{ 267{
278 return isofs_hashi_common(dentry, qstr, 0); 268 return isofs_hashi_common(dentry, qstr, 0);
279} 269}
280 270
281static int 271static int
282isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, 272isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
283 const struct dentry *dentry, const struct inode *inode,
284 unsigned int len, const char *str, const struct qstr *name) 273 unsigned int len, const char *str, const struct qstr *name)
285{ 274{
286 return isofs_dentry_cmp_common(len, str, name, 0, 0); 275 return isofs_dentry_cmp_common(len, str, name, 0, 0);
287} 276}
288 277
289static int 278static int
290isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, 279isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
291 const struct dentry *dentry, const struct inode *inode,
292 unsigned int len, const char *str, const struct qstr *name) 280 unsigned int len, const char *str, const struct qstr *name)
293{ 281{
294 return isofs_dentry_cmp_common(len, str, name, 0, 1); 282 return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
296 284
297#ifdef CONFIG_JOLIET 285#ifdef CONFIG_JOLIET
298static int 286static int
299isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, 287isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
300 struct qstr *qstr)
301{ 288{
302 return isofs_hash_common(dentry, qstr, 1); 289 return isofs_hash_common(dentry, qstr, 1);
303} 290}
304 291
305static int 292static int
306isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, 293isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
307 struct qstr *qstr)
308{ 294{
309 return isofs_hashi_common(dentry, qstr, 1); 295 return isofs_hashi_common(dentry, qstr, 1);
310} 296}
311 297
312static int 298static int
313isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, 299isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
314 const struct dentry *dentry, const struct inode *inode,
315 unsigned int len, const char *str, const struct qstr *name) 300 unsigned int len, const char *str, const struct qstr *name)
316{ 301{
317 return isofs_dentry_cmp_common(len, str, name, 1, 0); 302 return isofs_dentry_cmp_common(len, str, name, 1, 0);
318} 303}
319 304
320static int 305static int
321isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, 306isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
322 const struct dentry *dentry, const struct inode *inode,
323 unsigned int len, const char *str, const struct qstr *name) 307 unsigned int len, const char *str, const struct qstr *name)
324{ 308{
325 return isofs_dentry_cmp_common(len, str, name, 1, 1); 309 return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, 40 return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 41}
43 42
44/* 43/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
2019 * void journal_invalidatepage() - invalidate a journal page 2019 * void journal_invalidatepage() - invalidate a journal page
2020 * @journal: journal to use for flush 2020 * @journal: journal to use for flush
2021 * @page: page to flush 2021 * @page: page to flush
2022 * @offset: length of page to invalidate. 2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2023 * 2024 *
2024 * Reap page buffers containing data after offset in page. 2025 * Reap page buffers containing data in specified range in page.
2025 */ 2026 */
2026void journal_invalidatepage(journal_t *journal, 2027void journal_invalidatepage(journal_t *journal,
2027 struct page *page, 2028 struct page *page,
2028 unsigned long offset) 2029 unsigned int offset,
2030 unsigned int length)
2029{ 2031{
2030 struct buffer_head *head, *bh, *next; 2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2031 unsigned int curr_off = 0; 2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2032 int may_free = 1; 2036 int may_free = 1;
2033 2037
2034 if (!PageLocked(page)) 2038 if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
2036 if (!page_has_buffers(page)) 2040 if (!page_has_buffers(page))
2037 return; 2041 return;
2038 2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2039 /* We will potentially be playing with lists other than just the 2045 /* We will potentially be playing with lists other than just the
2040 * data lists (especially for journaled data mode), so be 2046 * data lists (especially for journaled data mode), so be
2041 * cautious in our locking. */ 2047 * cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
2045 unsigned int next_off = curr_off + bh->b_size; 2051 unsigned int next_off = curr_off + bh->b_size;
2046 next = bh->b_this_page; 2052 next = bh->b_this_page;
2047 2053
2054 if (next_off > stop)
2055 return;
2056
2048 if (offset <= curr_off) { 2057 if (offset <= curr_off) {
2049 /* This block is wholly outside the truncation point */ 2058 /* This block is wholly outside the truncation point */
2050 lock_buffer(bh); 2059 lock_buffer(bh);
2051 may_free &= journal_unmap_buffer(journal, bh, 2060 may_free &= journal_unmap_buffer(journal, bh,
2052 offset > 0); 2061 partial_page);
2053 unlock_buffer(bh); 2062 unlock_buffer(bh);
2054 } 2063 }
2055 curr_off = next_off; 2064 curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
2057 2066
2058 } while (bh != head); 2067 } while (bh != head);
2059 2068
2060 if (!offset) { 2069 if (!partial_page) {
2061 if (may_free && try_to_free_buffers(page)) 2070 if (may_free && try_to_free_buffers(page))
2062 J_ASSERT(!page_has_buffers(page)); 2071 J_ASSERT(!page_has_buffers(page));
2063 } 2072 }
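
The jbd change teaches journal_invalidatepage() about partial ranges: stop = offset + length bounds the scan, any buffer ending beyond stop is left untouched, and partial_page (true whenever the range is not the whole page) both softens journal_unmap_buffer() and suppresses the final try_to_free_buffers(). Walking the loop through one concrete case makes the bounds clear:

    /* 4096-byte page, 1024-byte buffers, invalidating offset=1024,
     * length=2048, so stop == 3072 and partial_page is true:
     *   buffer @0    : next_off 1024 <= stop, offset > curr_off -> kept
     *   buffer @1024 : offset <= curr_off                       -> unmapped
     *   buffer @2048 : offset <= curr_off                       -> unmapped
     *   buffer @3072 : next_off 4096 > stop                     -> early return
     * partial_page is true, so try_to_free_buffers() is not attempted. */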
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
20 20
21config JBD2_DEBUG 21config JBD2_DEBUG
22 bool "JBD2 (ext4) debugging support" 22 bool "JBD2 (ext4) debugging support"
23 depends on JBD2 && DEBUG_FS 23 depends on JBD2
24 help 24 help
25 If you are using the ext4 journaled file system (or 25 If you are using the ext4 journaled file system (or
26 potentially any other filesystem/device using JBD2), this option 26 potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
29 By default, the debugging output will be turned off. 29 By default, the debugging output will be turned off.
30 30
31 If you select Y here, then you will be able to turn on debugging 31 If you select Y here, then you will be able to turn on debugging
32 with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a 32 with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
33 number between 1 and 5. The higher the number, the more debugging 33 number between 1 and 5. The higher the number, the more debugging
34 output is generated. To turn debugging off again, do 34 output is generated. To turn debugging off again, do
35 "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". 35 "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
120 int nblocks, space_left; 120 int nblocks, space_left;
121 /* assert_spin_locked(&journal->j_state_lock); */ 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd2_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 write_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
140 */ 140 */
141 write_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd2_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = jbd2_log_space_left(journal);
145 if (space_left < nblocks) { 145 if (space_left < nblocks) {
146 int chkpt = journal->j_checkpoint_transactions != NULL; 146 int chkpt = journal->j_checkpoint_transactions != NULL;
147 tid_t tid = 0; 147 tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
156 /* We were able to recover space; yay! */ 156 /* We were able to recover space; yay! */
157 ; 157 ;
158 } else if (tid) { 158 } else if (tid) {
159 /*
160 * jbd2_journal_commit_transaction() may want
161 * to take the checkpoint_mutex if JBD2_FLUSHED
162 * is set. So we need to temporarily drop it.
163 */
164 mutex_unlock(&journal->j_checkpoint_mutex);
159 jbd2_log_wait_commit(journal, tid); 165 jbd2_log_wait_commit(journal, tid);
166 write_lock(&journal->j_state_lock);
167 continue;
160 } else { 168 } else {
161 printk(KERN_ERR "%s: needed %d blocks and " 169 printk(KERN_ERR "%s: needed %d blocks and "
162 "only had %d space available\n", 170 "only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
625 633
626 __jbd2_journal_drop_transaction(journal, transaction); 634 __jbd2_journal_drop_transaction(journal, transaction);
627 jbd2_journal_free_transaction(transaction); 635 jbd2_journal_free_transaction(transaction);
628
629 /* Just in case anybody was waiting for more transactions to be
630 checkpointed... */
631 wake_up(&journal->j_wait_logspace);
632 ret = 1; 636 ret = 1;
633out: 637out:
634 return ret; 638 return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
690 J_ASSERT(transaction->t_state == T_FINISHED); 694 J_ASSERT(transaction->t_state == T_FINISHED);
691 J_ASSERT(transaction->t_buffers == NULL); 695 J_ASSERT(transaction->t_buffers == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 696 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 697 J_ASSERT(transaction->t_shadow_list == NULL);
695 J_ASSERT(transaction->t_log_list == NULL);
696 J_ASSERT(transaction->t_checkpoint_list == NULL); 698 J_ASSERT(transaction->t_checkpoint_list == NULL);
697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 699 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
698 J_ASSERT(atomic_read(&transaction->t_updates) == 0); 700 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
30#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31 31
32/* 32/*
33 * Default IO end handler for temporary BJ_IO buffer_heads. 33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */ 34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{ 36{
37 struct buffer_head *orig_bh = bh->b_private;
38
37 BUFFER_TRACE(bh, ""); 39 BUFFER_TRACE(bh, "");
38 if (uptodate) 40 if (uptodate)
39 set_buffer_uptodate(bh); 41 set_buffer_uptodate(bh);
40 else 42 else
41 clear_buffer_uptodate(bh); 43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_clear_bit();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
42 unlock_buffer(bh); 49 unlock_buffer(bh);
43} 50}
44 51
@@ -85,8 +92,7 @@ nope:
85 __brelse(bh); 92 __brelse(bh);
86} 93}
87 94
88static void jbd2_commit_block_csum_set(journal_t *j, 95static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
89 struct journal_head *descriptor)
90{ 96{
91 struct commit_header *h; 97 struct commit_header *h;
92 __u32 csum; 98 __u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return; 101 return;
96 102
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data); 103 h = (struct commit_header *)(bh->b_data);
98 h->h_chksum_type = 0; 104 h->h_chksum_type = 0;
99 h->h_chksum_size = 0; 105 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0; 106 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum); 108 h->h_chksum[0] = cpu_to_be32(csum);
104} 109}
105 110
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
116 struct buffer_head **cbh, 121 struct buffer_head **cbh,
117 __u32 crc32_sum) 122 __u32 crc32_sum)
118{ 123{
119 struct journal_head *descriptor;
120 struct commit_header *tmp; 124 struct commit_header *tmp;
121 struct buffer_head *bh; 125 struct buffer_head *bh;
122 int ret; 126 int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
127 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
128 return 0; 132 return 0;
129 133
130 descriptor = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(journal);
131 if (!descriptor) 135 if (!bh)
132 return 1; 136 return 1;
133 137
134 bh = jh2bh(descriptor);
135
136 tmp = (struct commit_header *)bh->b_data; 138 tmp = (struct commit_header *)bh->b_data;
137 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 148 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 149 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
148 } 150 }
149 jbd2_commit_block_csum_set(journal, descriptor); 151 jbd2_commit_block_csum_set(journal, bh);
150 152
151 JBUFFER_TRACE(descriptor, "submit commit block"); 153 BUFFER_TRACE(bh, "submit commit block");
152 lock_buffer(bh); 154 lock_buffer(bh);
153 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
154 set_buffer_uptodate(bh); 156 set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
180 if (unlikely(!buffer_uptodate(bh))) 182 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO; 183 ret = -EIO;
182 put_bh(bh); /* One for getblk() */ 184 put_bh(bh); /* One for getblk() */
183 jbd2_journal_put_journal_head(bh2jh(bh));
184 185
185 return ret; 186 return ret;
186} 187}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
321} 322}
322 323
323static void jbd2_descr_block_csum_set(journal_t *j, 324static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor) 325 struct buffer_head *bh)
325{ 326{
326 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
327 __u32 csum; 328 __u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return; 331 return;
331 332
332 tail = (struct jbd2_journal_block_tail *) 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail)); 334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0; 335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 336 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum); 337 tail->t_checksum = cpu_to_be32(csum);
339} 338}
340 339
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
343{ 342{
344 struct page *page = bh->b_page; 343 struct page *page = bh->b_page;
345 __u8 *addr; 344 __u8 *addr;
346 __u32 csum; 345 __u32 csum32;
347 346
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 347 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return; 348 return;
350 349
351 sequence = cpu_to_be32(sequence); 350 sequence = cpu_to_be32(sequence);
352 addr = kmap_atomic(page); 351 addr = kmap_atomic(page);
353 csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence)); 353 sizeof(sequence));
355 csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), 354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
356 bh->b_size); 355 bh->b_size);
357 kunmap_atomic(addr); 356 kunmap_atomic(addr);
358 357
359 tag->t_checksum = cpu_to_be32(csum); 358 /* We only have space to store the lower 16 bits of the crc32c. */
359 tag->t_checksum = cpu_to_be16(csum32);
360} 360}
361/* 361/*
362 * jbd2_journal_commit_transaction 362 * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
368{ 368{
369 struct transaction_stats_s stats; 369 struct transaction_stats_s stats;
370 transaction_t *commit_transaction; 370 transaction_t *commit_transaction;
371 struct journal_head *jh, *new_jh, *descriptor; 371 struct journal_head *jh;
372 struct buffer_head *descriptor;
372 struct buffer_head **wbuf = journal->j_wbuf; 373 struct buffer_head **wbuf = journal->j_wbuf;
373 int bufs; 374 int bufs;
374 int flags; 375 int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
392 tid_t first_tid; 393 tid_t first_tid;
393 int update_tail; 394 int update_tail;
394 int csum_size = 0; 395 int csum_size = 0;
396 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs);
395 398
396 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
397 csum_size = sizeof(struct jbd2_journal_block_tail); 400 csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
424 J_ASSERT(journal->j_committing_transaction == NULL); 427 J_ASSERT(journal->j_committing_transaction == NULL);
425 428
426 commit_transaction = journal->j_running_transaction; 429 commit_transaction = journal->j_running_transaction;
427 J_ASSERT(commit_transaction->t_state == T_RUNNING);
428 430
429 trace_jbd2_start_commit(journal, commit_transaction); 431 trace_jbd2_start_commit(journal, commit_transaction);
430 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 432 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
431 commit_transaction->t_tid); 433 commit_transaction->t_tid);
432 434
433 write_lock(&journal->j_state_lock); 435 write_lock(&journal->j_state_lock);
436 J_ASSERT(commit_transaction->t_state == T_RUNNING);
434 commit_transaction->t_state = T_LOCKED; 437 commit_transaction->t_state = T_LOCKED;
435 438
436 trace_jbd2_commit_locking(journal, commit_transaction); 439 trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 */ 523 */
521 jbd2_journal_switch_revoke_table(journal); 524 jbd2_journal_switch_revoke_table(journal);
522 525
526 /*
527 * Reserved credits cannot be claimed anymore, free them
528 */
529 atomic_sub(atomic_read(&journal->j_reserved_credits),
530 &commit_transaction->t_outstanding_credits);
531
523 trace_jbd2_commit_flushing(journal, commit_transaction); 532 trace_jbd2_commit_flushing(journal, commit_transaction);
524 stats.run.rs_flushing = jiffies; 533 stats.run.rs_flushing = jiffies;
525 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 534 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
533 wake_up(&journal->j_wait_transaction_locked); 542 wake_up(&journal->j_wait_transaction_locked);
534 write_unlock(&journal->j_state_lock); 543 write_unlock(&journal->j_state_lock);
535 544
536 jbd_debug(3, "JBD2: commit phase 2\n"); 545 jbd_debug(3, "JBD2: commit phase 2a\n");
537 546
538 /* 547 /*
539 * Now start flushing things to disk, in the order they appear 548 * Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
545 554
546 blk_start_plug(&plug); 555 blk_start_plug(&plug);
547 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
548 WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
549 blk_finish_plug(&plug); 558 blk_finish_plug(&plug);
550 559
551 jbd_debug(3, "JBD2: commit phase 2\n"); 560 jbd_debug(3, "JBD2: commit phase 2b\n");
552 561
553 /* 562 /*
554 * Way to go: we have now written out all of the data for a 563 * Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
571 atomic_read(&commit_transaction->t_outstanding_credits)); 580 atomic_read(&commit_transaction->t_outstanding_credits));
572 581
573 err = 0; 582 err = 0;
574 descriptor = NULL;
575 bufs = 0; 583 bufs = 0;
584 descriptor = NULL;
576 blk_start_plug(&plug); 585 blk_start_plug(&plug);
577 while (commit_transaction->t_buffers) { 586 while (commit_transaction->t_buffers) {
578 587
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
604 record the metadata buffer. */ 613 record the metadata buffer. */
605 614
606 if (!descriptor) { 615 if (!descriptor) {
607 struct buffer_head *bh;
608
609 J_ASSERT (bufs == 0); 616 J_ASSERT (bufs == 0);
610 617
611 jbd_debug(4, "JBD2: get descriptor\n"); 618 jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 continue; 623 continue;
617 } 624 }
618 625
619 bh = jh2bh(descriptor);
620 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 626 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
621 (unsigned long long)bh->b_blocknr, bh->b_data); 627 (unsigned long long)descriptor->b_blocknr,
622 header = (journal_header_t *)&bh->b_data[0]; 628 descriptor->b_data);
629 header = (journal_header_t *)descriptor->b_data;
623 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 630 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
624 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 631 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
625 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 632 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
626 633
627 tagp = &bh->b_data[sizeof(journal_header_t)]; 634 tagp = &descriptor->b_data[sizeof(journal_header_t)];
628 space_left = bh->b_size - sizeof(journal_header_t); 635 space_left = descriptor->b_size -
636 sizeof(journal_header_t);
629 first_tag = 1; 637 first_tag = 1;
630 set_buffer_jwrite(bh); 638 set_buffer_jwrite(descriptor);
631 set_buffer_dirty(bh); 639 set_buffer_dirty(descriptor);
632 wbuf[bufs++] = bh; 640 wbuf[bufs++] = descriptor;
633 641
634 /* Record it so that we can wait for IO 642 /* Record it so that we can wait for IO
635 completion later */ 643 completion later */
636 BUFFER_TRACE(bh, "ph3: file as descriptor"); 644 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
637 jbd2_journal_file_buffer(descriptor, commit_transaction, 645 jbd2_file_log_bh(&log_bufs, descriptor);
638 BJ_LogCtl);
639 } 646 }
640 647
641 /* Where is the buffer to be written? */ 648 /* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
658 665
659 /* Bump b_count to prevent truncate from stumbling over 666 /* Bump b_count to prevent truncate from stumbling over
660 the shadowed buffer! @@@ This can go if we ever get 667 the shadowed buffer! @@@ This can go if we ever get
661 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 668 rid of the shadow pairing of buffers. */
662 atomic_inc(&jh2bh(jh)->b_count); 669 atomic_inc(&jh2bh(jh)->b_count);
663 670
664 /* Make a temporary IO buffer with which to write it out
665 (this will requeue both the metadata buffer and the
666 temporary IO buffer). new_bh goes on BJ_IO*/
667
668 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
669 /* 671 /*
670 * akpm: jbd2_journal_write_metadata_buffer() sets 672 * Make a temporary IO buffer with which to write it out
671 * new_bh->b_transaction to commit_transaction. 673 * (this will requeue the metadata buffer to BJ_Shadow).
672 * We need to clean this up before we release new_bh
673 * (which is of type BJ_IO)
674 */ 674 */
675 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata"); 676 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 677 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &new_jh, blocknr); 678 jh, &wbuf[bufs], blocknr);
678 if (flags < 0) { 679 if (flags < 0) {
679 jbd2_journal_abort(journal, flags); 680 jbd2_journal_abort(journal, flags);
680 continue; 681 continue;
681 } 682 }
682 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 683 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683 wbuf[bufs++] = jh2bh(new_jh);
684 684
685 /* Record the new block's tag in the current descriptor 685 /* Record the new block's tag in the current descriptor
686 buffer */ 686 buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
694 tag = (journal_block_tag_t *) tagp; 694 tag = (journal_block_tag_t *) tagp;
695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
696 tag->t_flags = cpu_to_be16(tag_flag); 696 tag->t_flags = cpu_to_be16(tag_flag);
697 jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), 697 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
698 commit_transaction->t_tid); 698 commit_transaction->t_tid);
699 tagp += tag_bytes; 699 tagp += tag_bytes;
700 space_left -= tag_bytes; 700 space_left -= tag_bytes;
701 bufs++;
701 702
702 if (first_tag) { 703 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16); 704 memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
809 the log. Before we can commit it, wait for the IO so far to 810 the log. Before we can commit it, wait for the IO so far to
810 complete. Control buffers being written are on the 811 complete. Control buffers being written are on the
811 transaction's t_log_list queue, and metadata buffers are on 812 transaction's t_log_list queue, and metadata buffers are on
812 the t_iobuf_list queue. 813 the io_bufs list.
813 814
814 Wait for the buffers in reverse order. That way we are 815 Wait for the buffers in reverse order. That way we are
815 less likely to be woken up until all IOs have completed, and 816 less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
818 819
819 jbd_debug(3, "JBD2: commit phase 3\n"); 820 jbd_debug(3, "JBD2: commit phase 3\n");
820 821
821 /* 822 while (!list_empty(&io_bufs)) {
822 * akpm: these are BJ_IO, and j_list_lock is not needed. 823 struct buffer_head *bh = list_entry(io_bufs.prev,
823 * See __journal_try_to_free_buffer. 824 struct buffer_head,
824 */ 825 b_assoc_buffers);
825wait_for_iobuf:
826 while (commit_transaction->t_iobuf_list != NULL) {
827 struct buffer_head *bh;
828 826
829 jh = commit_transaction->t_iobuf_list->b_tprev; 827 wait_on_buffer(bh);
830 bh = jh2bh(jh); 828 cond_resched();
831 if (buffer_locked(bh)) {
832 wait_on_buffer(bh);
833 goto wait_for_iobuf;
834 }
835 if (cond_resched())
836 goto wait_for_iobuf;
837 829
838 if (unlikely(!buffer_uptodate(bh))) 830 if (unlikely(!buffer_uptodate(bh)))
839 err = -EIO; 831 err = -EIO;
840 832 jbd2_unfile_log_bh(bh);
841 clear_buffer_jwrite(bh);
842
843 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
844 jbd2_journal_unfile_buffer(journal, jh);
845 833
846 /* 834 /*
847 * ->t_iobuf_list should contain only dummy buffer_heads 835 * The list contains temporary buffer heads created by
848 * which were created by jbd2_journal_write_metadata_buffer(). 836 * jbd2_journal_write_metadata_buffer().
849 */ 837 */
850 BUFFER_TRACE(bh, "dumping temporary bh"); 838 BUFFER_TRACE(bh, "dumping temporary bh");
851 jbd2_journal_put_journal_head(jh);
852 __brelse(bh); 839 __brelse(bh);
853 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 840 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
854 free_buffer_head(bh); 841 free_buffer_head(bh);
855 842
856 /* We also have to unlock and free the corresponding 843 /* We also have to refile the corresponding shadowed buffer */
857 shadowed buffer */
858 jh = commit_transaction->t_shadow_list->b_tprev; 844 jh = commit_transaction->t_shadow_list->b_tprev;
859 bh = jh2bh(jh); 845 bh = jh2bh(jh);
860 clear_bit(BH_JWrite, &bh->b_state); 846 clear_buffer_jwrite(bh);
861 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 847 J_ASSERT_BH(bh, buffer_jbddirty(bh));
848 J_ASSERT_BH(bh, !buffer_shadow(bh));
862 849
863 /* The metadata is now released for reuse, but we need 850 /* The metadata is now released for reuse, but we need
864 to remember it against this transaction so that when 851 to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
866 required. */ 853 required. */
867 JBUFFER_TRACE(jh, "file as BJ_Forget"); 854 JBUFFER_TRACE(jh, "file as BJ_Forget");
868 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 855 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
869 /*
870 * Wake up any transactions which were waiting for this IO to
871 * complete. The barrier must be here so that changes by
872 * jbd2_journal_file_buffer() take effect before wake_up_bit()
873 * does the waitqueue check.
874 */
875 smp_mb();
876 wake_up_bit(&bh->b_state, BH_Unshadow);
877 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 856 JBUFFER_TRACE(jh, "brelse shadowed buffer");
878 __brelse(bh); 857 __brelse(bh);
879 } 858 }
@@ -883,26 +862,19 @@ wait_for_iobuf:
883 jbd_debug(3, "JBD2: commit phase 4\n"); 862 jbd_debug(3, "JBD2: commit phase 4\n");
884 863
885 /* Here we wait for the revoke record and descriptor record buffers */ 864 /* Here we wait for the revoke record and descriptor record buffers */
886 wait_for_ctlbuf: 865 while (!list_empty(&log_bufs)) {
887 while (commit_transaction->t_log_list != NULL) {
888 struct buffer_head *bh; 866 struct buffer_head *bh;
889 867
890 jh = commit_transaction->t_log_list->b_tprev; 868 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
891 bh = jh2bh(jh); 869 wait_on_buffer(bh);
892 if (buffer_locked(bh)) { 870 cond_resched();
893 wait_on_buffer(bh);
894 goto wait_for_ctlbuf;
895 }
896 if (cond_resched())
897 goto wait_for_ctlbuf;
898 871
899 if (unlikely(!buffer_uptodate(bh))) 872 if (unlikely(!buffer_uptodate(bh)))
900 err = -EIO; 873 err = -EIO;
901 874
902 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 875 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
903 clear_buffer_jwrite(bh); 876 clear_buffer_jwrite(bh);
904 jbd2_journal_unfile_buffer(journal, jh); 877 jbd2_unfile_log_bh(bh);
905 jbd2_journal_put_journal_head(jh);
906 __brelse(bh); /* One for getblk */ 878 __brelse(bh); /* One for getblk */
907 /* AKPM: bforget here */ 879 /* AKPM: bforget here */
908 } 880 }
@@ -952,9 +924,7 @@ wait_for_iobuf:
952 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 924 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
953 J_ASSERT(commit_transaction->t_buffers == NULL); 925 J_ASSERT(commit_transaction->t_buffers == NULL);
954 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 926 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
955 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
956 J_ASSERT(commit_transaction->t_shadow_list == NULL); 927 J_ASSERT(commit_transaction->t_shadow_list == NULL);
957 J_ASSERT(commit_transaction->t_log_list == NULL);
958 928
959restart_loop: 929restart_loop:
960 /* 930 /*
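
With BJ_IO and BJ_LogCtl gone, the commit path tracks its temporary buffers on the two local lists (io_bufs, log_bufs) through b_assoc_buffers, and the shadow handshake moves into buffer state: journal_end_buffer_io_sync() clears BH_Shadow on the original buffer and wakes any bit-waiters. The sleeping side lives in do_get_write_access(); reconstructed from this wakeup (the exact code is outside this diff, so treat it as a sketch), it looks approximately like:

    /* Inside do_get_write_access(), under jbd_lock_bh_state(bh): */
    if (buffer_shadow(bh)) {
            JBUFFER_TRACE(jh, "on shadow: sleep");
            jbd_unlock_bh_state(bh);
            wait_on_bit(&bh->b_state, BH_Shadow,
                        sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
            goto repeat;    /* retake the lock and re-check the buffer */
    }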
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
103static void __journal_abort_soft (journal_t *journal, int errno); 103static void __journal_abort_soft (journal_t *journal, int errno);
104static int jbd2_journal_create_slab(size_t slab_size); 104static int jbd2_journal_create_slab(size_t slab_size);
105 105
106#ifdef CONFIG_JBD2_DEBUG
107void __jbd2_debug(int level, const char *file, const char *func,
108 unsigned int line, const char *fmt, ...)
109{
110 struct va_format vaf;
111 va_list args;
112
113 if (level > jbd2_journal_enable_debug)
114 return;
115 va_start(args, fmt);
116 vaf.fmt = fmt;
117 vaf.va = &args;
118 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
119 va_end(args);
120}
121EXPORT_SYMBOL(__jbd2_debug);
122#endif
123
106/* Checksumming functions */ 124/* Checksumming functions */
107int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
108{ 126{
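
The new out-of-line __jbd2_debug() is meant to be driven by a call-site macro that supplies the file, function, and line; a plausible sketch of that wrapper (the real definition lives in include/linux/jbd2.h):

	#define jbd_debug(n, fmt, a...) \
		__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
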
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
310 * 328 *
311 * If the source buffer has already been modified by a new transaction 329 * If the source buffer has already been modified by a new transaction
312 * since we took the last commit snapshot, we use the frozen copy of 330 * since we took the last commit snapshot, we use the frozen copy of
313 * that data for IO. If we end up using the existing buffer_head's data 331 * that data for IO. If we end up using the existing buffer_head's data
314 * for the write, then we *have* to lock the buffer to prevent anyone 332 * for the write, then we have to make sure nobody modifies it while the
315 * else from using and possibly modifying it while the IO is in 333 * IO is in progress. do_get_write_access() handles this.
316 * progress.
317 * 334 *
318 * The function returns a pointer to the buffer_heads to be used for IO. 335 * The function returns a pointer to the buffer_head to be used for IO.
319 * 336 *
320 * We assume that the journal has already been locked in this function.
321 * 337 *
322 * Return value: 338 * Return value:
323 * <0: Error 339 * <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
330 346
331int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 347int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
332 struct journal_head *jh_in, 348 struct journal_head *jh_in,
333 struct journal_head **jh_out, 349 struct buffer_head **bh_out,
334 unsigned long long blocknr) 350 sector_t blocknr)
335{ 351{
336 int need_copy_out = 0; 352 int need_copy_out = 0;
337 int done_copy_out = 0; 353 int done_copy_out = 0;
338 int do_escape = 0; 354 int do_escape = 0;
339 char *mapped_data; 355 char *mapped_data;
340 struct buffer_head *new_bh; 356 struct buffer_head *new_bh;
341 struct journal_head *new_jh;
342 struct page *new_page; 357 struct page *new_page;
343 unsigned int new_offset; 358 unsigned int new_offset;
344 struct buffer_head *bh_in = jh2bh(jh_in); 359 struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
368 383
369 /* keep subsequent assertions sane */ 384 /* keep subsequent assertions sane */
370 atomic_set(&new_bh->b_count, 1); 385 atomic_set(&new_bh->b_count, 1);
371 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
372 386
387 jbd_lock_bh_state(bh_in);
388repeat:
373 /* 389 /*
374 * If a new transaction has already done a buffer copy-out, then 390 * If a new transaction has already done a buffer copy-out, then
375 * we use that version of the data for the commit. 391 * we use that version of the data for the commit.
376 */ 392 */
377 jbd_lock_bh_state(bh_in);
378repeat:
379 if (jh_in->b_frozen_data) { 393 if (jh_in->b_frozen_data) {
380 done_copy_out = 1; 394 done_copy_out = 1;
381 new_page = virt_to_page(jh_in->b_frozen_data); 395 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
415 jbd_unlock_bh_state(bh_in); 429 jbd_unlock_bh_state(bh_in);
416 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 430 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
417 if (!tmp) { 431 if (!tmp) {
418 jbd2_journal_put_journal_head(new_jh); 432 brelse(new_bh);
419 return -ENOMEM; 433 return -ENOMEM;
420 } 434 }
421 jbd_lock_bh_state(bh_in); 435 jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
426 440
427 jh_in->b_frozen_data = tmp; 441 jh_in->b_frozen_data = tmp;
428 mapped_data = kmap_atomic(new_page); 442 mapped_data = kmap_atomic(new_page);
429 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 443 memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
430 kunmap_atomic(mapped_data); 444 kunmap_atomic(mapped_data);
431 445
432 new_page = virt_to_page(tmp); 446 new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
452 } 466 }
453 467
454 set_bh_page(new_bh, new_page, new_offset); 468 set_bh_page(new_bh, new_page, new_offset);
455 new_jh->b_transaction = NULL; 469 new_bh->b_size = bh_in->b_size;
456 new_bh->b_size = jh2bh(jh_in)->b_size; 470 new_bh->b_bdev = journal->j_dev;
457 new_bh->b_bdev = transaction->t_journal->j_dev;
458 new_bh->b_blocknr = blocknr; 471 new_bh->b_blocknr = blocknr;
472 new_bh->b_private = bh_in;
459 set_buffer_mapped(new_bh); 473 set_buffer_mapped(new_bh);
460 set_buffer_dirty(new_bh); 474 set_buffer_dirty(new_bh);
461 475
462 *jh_out = new_jh; 476 *bh_out = new_bh;
463 477
464 /* 478 /*
465 * The to-be-written buffer needs to get moved to the io queue, 479 * The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
470 spin_lock(&journal->j_list_lock); 484 spin_lock(&journal->j_list_lock);
471 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 485 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
472 spin_unlock(&journal->j_list_lock); 486 spin_unlock(&journal->j_list_lock);
487 set_buffer_shadow(bh_in);
473 jbd_unlock_bh_state(bh_in); 488 jbd_unlock_bh_state(bh_in);
474 489
475 JBUFFER_TRACE(new_jh, "file as BJ_IO");
476 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
477
478 return do_escape | (done_copy_out << 1); 490 return do_escape | (done_copy_out << 1);
479} 491}
480 492
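
The return value still packs two bits: bit 0 is do_escape (the block starts with the JBD2 magic and must be escaped on disk) and bit 1 is done_copy_out (the frozen copy, not the live buffer, backs the write). A hedged sketch of how a commit-path caller might decode it; wbuf and tag_flag are hypothetical locals, not part of this patch:

	int flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh,
						       &wbuf[bufs], blocknr);
	if (flags < 0)
		return flags;			/* allocation failure, etc. */
	if (flags & 1)				/* do_escape */
		tag_flag |= JBD2_FLAG_ESCAPE;	/* mark the on-disk tag */
	if (flags & 2)				/* done_copy_out */
		;	/* IO is backed by b_frozen_data, not the live buffer */
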
@@ -484,35 +496,6 @@ repeat:
484 */ 496 */
485 497
486/* 498/*
487 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
488 *
489 * Called with the journal already locked.
490 *
491 * Called under j_state_lock
492 */
493
494int __jbd2_log_space_left(journal_t *journal)
495{
496 int left = journal->j_free;
497
498 /* assert_spin_locked(&journal->j_state_lock); */
499
500 /*
501 * Be pessimistic here about the number of those free blocks which
502 * might be required for log descriptor control blocks.
503 */
504
505#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
506
507 left -= MIN_LOG_RESERVED_BLOCKS;
508
509 if (left <= 0)
510 return 0;
511 left -= (left >> 3);
512 return left;
513}
514
515/*
516 * Called with j_state_lock locked for writing. 499 * Called with j_state_lock locked for writing.
517 * Returns true if a transaction commit was started. 500 * Returns true if a transaction commit was started.
518 */ 501 */
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
564} 547}
565 548
566/* 549/*
567 * Force and wait upon a commit if the calling process is not within 550 * Force and wait any uncommitted transactions. We can only force the running
568 * transaction. This is used for forcing out undo-protected data which contains 551 * transaction if we don't have an active handle, otherwise, we will deadlock.
569 * bitmaps, when the fs is running out of space. 552 * Returns: <0 in case of error,
570 * 553 * 0 if nothing to commit,
571 * We can only force the running transaction if we don't have an active handle; 554 * 1 if transaction was successfully committed.
572 * otherwise, we will deadlock.
573 *
574 * Returns true if a transaction was started.
575 */ 555 */
576int jbd2_journal_force_commit_nested(journal_t *journal) 556static int __jbd2_journal_force_commit(journal_t *journal)
577{ 557{
578 transaction_t *transaction = NULL; 558 transaction_t *transaction = NULL;
579 tid_t tid; 559 tid_t tid;
580 int need_to_start = 0; 560 int need_to_start = 0, ret = 0;
581 561
582 read_lock(&journal->j_state_lock); 562 read_lock(&journal->j_state_lock);
583 if (journal->j_running_transaction && !current->journal_info) { 563 if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
588 transaction = journal->j_committing_transaction; 568 transaction = journal->j_committing_transaction;
589 569
590 if (!transaction) { 570 if (!transaction) {
571 /* Nothing to commit */
591 read_unlock(&journal->j_state_lock); 572 read_unlock(&journal->j_state_lock);
592 return 0; /* Nothing to retry */ 573 return 0;
593 } 574 }
594
595 tid = transaction->t_tid; 575 tid = transaction->t_tid;
596 read_unlock(&journal->j_state_lock); 576 read_unlock(&journal->j_state_lock);
597 if (need_to_start) 577 if (need_to_start)
598 jbd2_log_start_commit(journal, tid); 578 jbd2_log_start_commit(journal, tid);
599 jbd2_log_wait_commit(journal, tid); 579 ret = jbd2_log_wait_commit(journal, tid);
600 return 1; 580 if (!ret)
581 ret = 1;
582
583 return ret;
584}
585
586/**
 587 * Force and wait upon a commit if the calling process is not within a
 588 * transaction. This is used for forcing out undo-protected data which contains
589 * bitmaps, when the fs is running out of space.
590 *
591 * @journal: journal to force
592 * Returns true if progress was made.
593 */
594int jbd2_journal_force_commit_nested(journal_t *journal)
595{
596 int ret;
597
598 ret = __jbd2_journal_force_commit(journal);
599 return ret > 0;
600}
601
602/**
603 * int journal_force_commit() - force any uncommitted transactions
604 * @journal: journal to force
605 *
 606 * Caller wants an unconditional commit. We can only force the running transaction
 607 * if we don't have an active handle; otherwise we will deadlock.
608 */
609int jbd2_journal_force_commit(journal_t *journal)
610{
611 int ret;
612
613 J_ASSERT(!current->journal_info);
614 ret = __jbd2_journal_force_commit(journal);
615 if (ret > 0)
616 ret = 0;
617 return ret;
601} 618}
602 619
603/* 620/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
798 * But we don't bother doing that, so there will be coherency problems with 815 * But we don't bother doing that, so there will be coherency problems with
799 * mmaps of blockdevs which hold live JBD-controlled filesystems. 816 * mmaps of blockdevs which hold live JBD-controlled filesystems.
800 */ 817 */
801struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 818struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
802{ 819{
803 struct buffer_head *bh; 820 struct buffer_head *bh;
804 unsigned long long blocknr; 821 unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
817 set_buffer_uptodate(bh); 834 set_buffer_uptodate(bh);
818 unlock_buffer(bh); 835 unlock_buffer(bh);
819 BUFFER_TRACE(bh, "return this buffer"); 836 BUFFER_TRACE(bh, "return this buffer");
820 return jbd2_journal_add_journal_head(bh); 837 return bh;
821} 838}
822 839
823/* 840/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
1062 return NULL; 1079 return NULL;
1063 1080
1064 init_waitqueue_head(&journal->j_wait_transaction_locked); 1081 init_waitqueue_head(&journal->j_wait_transaction_locked);
1065 init_waitqueue_head(&journal->j_wait_logspace);
1066 init_waitqueue_head(&journal->j_wait_done_commit); 1082 init_waitqueue_head(&journal->j_wait_done_commit);
1067 init_waitqueue_head(&journal->j_wait_checkpoint);
1068 init_waitqueue_head(&journal->j_wait_commit); 1083 init_waitqueue_head(&journal->j_wait_commit);
1069 init_waitqueue_head(&journal->j_wait_updates); 1084 init_waitqueue_head(&journal->j_wait_updates);
1085 init_waitqueue_head(&journal->j_wait_reserved);
1070 mutex_init(&journal->j_barrier); 1086 mutex_init(&journal->j_barrier);
1071 mutex_init(&journal->j_checkpoint_mutex); 1087 mutex_init(&journal->j_checkpoint_mutex);
1072 spin_lock_init(&journal->j_revoke_lock); 1088 spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
1076 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 1092 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1077 journal->j_min_batch_time = 0; 1093 journal->j_min_batch_time = 0;
1078 journal->j_max_batch_time = 15000; /* 15ms */ 1094 journal->j_max_batch_time = 15000; /* 15ms */
1095 atomic_set(&journal->j_reserved_credits, 0);
1079 1096
1080 /* The journal is marked for error until we succeed with recovery! */ 1097 /* The journal is marked for error until we succeed with recovery! */
1081 journal->j_flags = JBD2_ABORT; 1098 journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
1318static void jbd2_write_superblock(journal_t *journal, int write_op) 1335static void jbd2_write_superblock(journal_t *journal, int write_op)
1319{ 1336{
1320 struct buffer_head *bh = journal->j_sb_buffer; 1337 struct buffer_head *bh = journal->j_sb_buffer;
1338 journal_superblock_t *sb = journal->j_superblock;
1321 int ret; 1339 int ret;
1322 1340
1323 trace_jbd2_write_superblock(journal, write_op); 1341 trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
1339 clear_buffer_write_io_error(bh); 1357 clear_buffer_write_io_error(bh);
1340 set_buffer_uptodate(bh); 1358 set_buffer_uptodate(bh);
1341 } 1359 }
1360 jbd2_superblock_csum_set(journal, sb);
1342 get_bh(bh); 1361 get_bh(bh);
1343 bh->b_end_io = end_buffer_write_sync; 1362 bh->b_end_io = end_buffer_write_sync;
1344 ret = submit_bh(write_op, bh); 1363 ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
1435 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1454 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1436 journal->j_errno); 1455 journal->j_errno);
1437 sb->s_errno = cpu_to_be32(journal->j_errno); 1456 sb->s_errno = cpu_to_be32(journal->j_errno);
1438 jbd2_superblock_csum_set(journal, sb);
1439 read_unlock(&journal->j_state_lock); 1457 read_unlock(&journal->j_state_lock);
1440 1458
1441 jbd2_write_superblock(journal, WRITE_SYNC); 1459 jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
2325#ifdef CONFIG_JBD2_DEBUG 2343#ifdef CONFIG_JBD2_DEBUG
2326 atomic_inc(&nr_journal_heads); 2344 atomic_inc(&nr_journal_heads);
2327#endif 2345#endif
2328 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2346 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2329 if (!ret) { 2347 if (!ret) {
2330 jbd_debug(1, "out of memory for journal_head\n"); 2348 jbd_debug(1, "out of memory for journal_head\n");
2331 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2349 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2332 while (!ret) { 2350 while (!ret) {
2333 yield(); 2351 yield();
2334 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2352 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2335 } 2353 }
2336 } 2354 }
2337 return ret; 2355 return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2393 struct journal_head *new_jh = NULL; 2411 struct journal_head *new_jh = NULL;
2394 2412
2395repeat: 2413repeat:
2396 if (!buffer_jbd(bh)) { 2414 if (!buffer_jbd(bh))
2397 new_jh = journal_alloc_journal_head(); 2415 new_jh = journal_alloc_journal_head();
2398 memset(new_jh, 0, sizeof(*new_jh));
2399 }
2400 2416
2401 jbd_lock_bh_journal_head(bh); 2417 jbd_lock_bh_journal_head(bh);
2402 if (buffer_jbd(bh)) { 2418 if (buffer_jbd(bh)) {
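
The switch to kmem_cache_zalloc() moves the zeroing into the allocator, which is why the explicit memset() disappears from jbd2_journal_add_journal_head() above; schematically:

	/* before: zero at the call site */
	jh = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
	if (jh)
		memset(jh, 0, sizeof(*jh));

	/* after: the allocator returns zeroed memory */
	jh = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
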
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 400 void *buf, __u32 sequence)
401{ 401{
402 __u32 provided, calculated; 402 __u32 csum32;
403 403
404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
405 return 1; 405 return 1;
406 406
407 sequence = cpu_to_be32(sequence); 407 sequence = cpu_to_be32(sequence);
408 calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 408 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
409 sizeof(sequence)); 409 sizeof(sequence));
410 calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); 410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 provided = be32_to_cpu(tag->t_checksum);
412 411
413 return provided == cpu_to_be32(calculated); 412 return tag->t_checksum == cpu_to_be16(csum32);
414} 413}
415 414
416static int do_one_pass(journal_t *journal, 415static int do_one_pass(journal_t *journal,
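
Since the on-disk journal_block_tag_t.t_checksum field is only 16 bits wide, the rewritten verify folds the 32-bit crc32c down by truncation; spelled out, the new comparison is equivalent to (a sketch, assuming t_checksum is a __be16):

	/* cpu_to_be16(csum32) keeps only the low 16 bits of the checksum */
	return be16_to_cpu(tag->t_checksum) == (csum32 & 0xffff);
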
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
122 122
123#ifdef __KERNEL__ 123#ifdef __KERNEL__
124static void write_one_revoke_record(journal_t *, transaction_t *, 124static void write_one_revoke_record(journal_t *, transaction_t *,
125 struct journal_head **, int *, 125 struct list_head *,
126 struct buffer_head **, int *,
126 struct jbd2_revoke_record_s *, int); 127 struct jbd2_revoke_record_s *, int);
127static void flush_descriptor(journal_t *, struct journal_head *, int, int); 128static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
128#endif 129#endif
129 130
130/* Utility functions to maintain the revoke table */ 131/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
531 */ 532 */
532void jbd2_journal_write_revoke_records(journal_t *journal, 533void jbd2_journal_write_revoke_records(journal_t *journal,
533 transaction_t *transaction, 534 transaction_t *transaction,
535 struct list_head *log_bufs,
534 int write_op) 536 int write_op)
535{ 537{
536 struct journal_head *descriptor; 538 struct buffer_head *descriptor;
537 struct jbd2_revoke_record_s *record; 539 struct jbd2_revoke_record_s *record;
538 struct jbd2_revoke_table_s *revoke; 540 struct jbd2_revoke_table_s *revoke;
539 struct list_head *hash_list; 541 struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
553 while (!list_empty(hash_list)) { 555 while (!list_empty(hash_list)) {
554 record = (struct jbd2_revoke_record_s *) 556 record = (struct jbd2_revoke_record_s *)
555 hash_list->next; 557 hash_list->next;
556 write_one_revoke_record(journal, transaction, 558 write_one_revoke_record(journal, transaction, log_bufs,
557 &descriptor, &offset, 559 &descriptor, &offset,
558 record, write_op); 560 record, write_op);
559 count++; 561 count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
573 575
574static void write_one_revoke_record(journal_t *journal, 576static void write_one_revoke_record(journal_t *journal,
575 transaction_t *transaction, 577 transaction_t *transaction,
576 struct journal_head **descriptorp, 578 struct list_head *log_bufs,
579 struct buffer_head **descriptorp,
577 int *offsetp, 580 int *offsetp,
578 struct jbd2_revoke_record_s *record, 581 struct jbd2_revoke_record_s *record,
579 int write_op) 582 int write_op)
580{ 583{
581 int csum_size = 0; 584 int csum_size = 0;
582 struct journal_head *descriptor; 585 struct buffer_head *descriptor;
583 int offset; 586 int offset;
584 journal_header_t *header; 587 journal_header_t *header;
585 588
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
609 descriptor = jbd2_journal_get_descriptor_buffer(journal); 612 descriptor = jbd2_journal_get_descriptor_buffer(journal);
610 if (!descriptor) 613 if (!descriptor)
611 return; 614 return;
612 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 615 header = (journal_header_t *)descriptor->b_data;
613 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 616 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
614 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); 617 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
615 header->h_sequence = cpu_to_be32(transaction->t_tid); 618 header->h_sequence = cpu_to_be32(transaction->t_tid);
616 619
617 /* Record it so that we can wait for IO completion later */ 620 /* Record it so that we can wait for IO completion later */
618 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 621 BUFFER_TRACE(descriptor, "file in log_bufs");
619 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); 622 jbd2_file_log_bh(log_bufs, descriptor);
620 623
621 offset = sizeof(jbd2_journal_revoke_header_t); 624 offset = sizeof(jbd2_journal_revoke_header_t);
622 *descriptorp = descriptor; 625 *descriptorp = descriptor;
623 } 626 }
624 627
625 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { 628 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
626 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = 629 * ((__be64 *)(&descriptor->b_data[offset])) =
627 cpu_to_be64(record->blocknr); 630 cpu_to_be64(record->blocknr);
628 offset += 8; 631 offset += 8;
629 632
630 } else { 633 } else {
631 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 634 * ((__be32 *)(&descriptor->b_data[offset])) =
632 cpu_to_be32(record->blocknr); 635 cpu_to_be32(record->blocknr);
633 offset += 4; 636 offset += 4;
634 } 637 }
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
636 *offsetp = offset; 639 *offsetp = offset;
637} 640}
638 641
639static void jbd2_revoke_csum_set(journal_t *j, 642static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
640 struct journal_head *descriptor)
641{ 643{
642 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
643 __u32 csum; 645 __u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
646 return; 648 return;
647 649
648 tail = (struct jbd2_journal_revoke_tail *) 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
649 (jh2bh(descriptor)->b_data + j->j_blocksize -
650 sizeof(struct jbd2_journal_revoke_tail)); 651 sizeof(struct jbd2_journal_revoke_tail));
651 tail->r_checksum = 0; 652 tail->r_checksum = 0;
652 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 653 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
653 j->j_blocksize);
654 tail->r_checksum = cpu_to_be32(csum); 654 tail->r_checksum = cpu_to_be32(csum);
655} 655}
656 656
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
662 */ 662 */
663 663
664static void flush_descriptor(journal_t *journal, 664static void flush_descriptor(journal_t *journal,
665 struct journal_head *descriptor, 665 struct buffer_head *descriptor,
666 int offset, int write_op) 666 int offset, int write_op)
667{ 667{
668 jbd2_journal_revoke_header_t *header; 668 jbd2_journal_revoke_header_t *header;
669 struct buffer_head *bh = jh2bh(descriptor);
670 669
671 if (is_journal_aborted(journal)) { 670 if (is_journal_aborted(journal)) {
672 put_bh(bh); 671 put_bh(descriptor);
673 return; 672 return;
674 } 673 }
675 674
676 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 675 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
677 header->r_count = cpu_to_be32(offset); 676 header->r_count = cpu_to_be32(offset);
678 jbd2_revoke_csum_set(journal, descriptor); 677 jbd2_revoke_csum_set(journal, descriptor);
679 678
680 set_buffer_jwrite(bh); 679 set_buffer_jwrite(descriptor);
681 BUFFER_TRACE(bh, "write"); 680 BUFFER_TRACE(descriptor, "write");
682 set_buffer_dirty(bh); 681 set_buffer_dirty(descriptor);
683 write_dirty_buffer(bh, write_op); 682 write_dirty_buffer(descriptor, write_op);
684} 683}
685#endif 684#endif
686 685
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
89 transaction->t_expires = jiffies + journal->j_commit_interval; 89 transaction->t_expires = jiffies + journal->j_commit_interval;
90 spin_lock_init(&transaction->t_handle_lock); 90 spin_lock_init(&transaction->t_handle_lock);
91 atomic_set(&transaction->t_updates, 0); 91 atomic_set(&transaction->t_updates, 0);
92 atomic_set(&transaction->t_outstanding_credits, 0); 92 atomic_set(&transaction->t_outstanding_credits,
93 atomic_read(&journal->j_reserved_credits));
93 atomic_set(&transaction->t_handle_count, 0); 94 atomic_set(&transaction->t_handle_count, 0);
94 INIT_LIST_HEAD(&transaction->t_inode_list); 95 INIT_LIST_HEAD(&transaction->t_inode_list);
95 INIT_LIST_HEAD(&transaction->t_private_list); 96 INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
141} 142}
142 143
143/* 144/*
 145 * Wait until the running transaction passes the T_LOCKED state. Also starts the
 146 * commit if needed. The function expects the running transaction to exist and
 147 * releases j_state_lock.
148 */
149static void wait_transaction_locked(journal_t *journal)
150 __releases(journal->j_state_lock)
151{
152 DEFINE_WAIT(wait);
153 int need_to_start;
154 tid_t tid = journal->j_running_transaction->t_tid;
155
156 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
157 TASK_UNINTERRUPTIBLE);
158 need_to_start = !tid_geq(journal->j_commit_request, tid);
159 read_unlock(&journal->j_state_lock);
160 if (need_to_start)
161 jbd2_log_start_commit(journal, tid);
162 schedule();
163 finish_wait(&journal->j_wait_transaction_locked, &wait);
164}
165
166static void sub_reserved_credits(journal_t *journal, int blocks)
167{
168 atomic_sub(blocks, &journal->j_reserved_credits);
169 wake_up(&journal->j_wait_reserved);
170}
171
172/*
173 * Wait until we can add credits for handle to the running transaction. Called
174 * with j_state_lock held for reading. Returns 0 if handle joined the running
175 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
176 * caller must retry.
177 */
178static int add_transaction_credits(journal_t *journal, int blocks,
179 int rsv_blocks)
180{
181 transaction_t *t = journal->j_running_transaction;
182 int needed;
183 int total = blocks + rsv_blocks;
184
185 /*
186 * If the current transaction is locked down for commit, wait
187 * for the lock to be released.
188 */
189 if (t->t_state == T_LOCKED) {
190 wait_transaction_locked(journal);
191 return 1;
192 }
193
194 /*
195 * If there is not enough space left in the log to write all
196 * potential buffers requested by this operation, we need to
197 * stall pending a log checkpoint to free some more log space.
198 */
199 needed = atomic_add_return(total, &t->t_outstanding_credits);
200 if (needed > journal->j_max_transaction_buffers) {
201 /*
202 * If the current transaction is already too large,
203 * then start to commit it: we can then go back and
204 * attach this handle to a new transaction.
205 */
206 atomic_sub(total, &t->t_outstanding_credits);
207 wait_transaction_locked(journal);
208 return 1;
209 }
210
211 /*
212 * The commit code assumes that it can get enough log space
213 * without forcing a checkpoint. This is *critical* for
214 * correctness: a checkpoint of a buffer which is also
215 * associated with a committing transaction creates a deadlock,
216 * so commit simply cannot force through checkpoints.
217 *
218 * We must therefore ensure the necessary space in the journal
219 * *before* starting to dirty potentially checkpointed buffers
220 * in the new transaction.
221 */
222 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
223 atomic_sub(total, &t->t_outstanding_credits);
224 read_unlock(&journal->j_state_lock);
225 write_lock(&journal->j_state_lock);
226 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
227 __jbd2_log_wait_for_space(journal);
228 write_unlock(&journal->j_state_lock);
229 return 1;
230 }
231
232 /* No reservation? We are done... */
233 if (!rsv_blocks)
234 return 0;
235
236 needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
237 /* We allow at most half of a transaction to be reserved */
238 if (needed > journal->j_max_transaction_buffers / 2) {
239 sub_reserved_credits(journal, rsv_blocks);
240 atomic_sub(total, &t->t_outstanding_credits);
241 read_unlock(&journal->j_state_lock);
242 wait_event(journal->j_wait_reserved,
243 atomic_read(&journal->j_reserved_credits) + rsv_blocks
244 <= journal->j_max_transaction_buffers / 2);
245 return 1;
246 }
247 return 0;
248}
249
250/*
144 * start_this_handle: Given a handle, deal with any locking or stalling 251 * start_this_handle: Given a handle, deal with any locking or stalling
145 * needed to make sure that there is enough journal space for the handle 252 * needed to make sure that there is enough journal space for the handle
146 * to begin. Attach the handle to a transaction and set up the 253 * to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
151 gfp_t gfp_mask) 258 gfp_t gfp_mask)
152{ 259{
153 transaction_t *transaction, *new_transaction = NULL; 260 transaction_t *transaction, *new_transaction = NULL;
154 tid_t tid; 261 int blocks = handle->h_buffer_credits;
155 int needed, need_to_start; 262 int rsv_blocks = 0;
156 int nblocks = handle->h_buffer_credits;
157 unsigned long ts = jiffies; 263 unsigned long ts = jiffies;
158 264
159 if (nblocks > journal->j_max_transaction_buffers) { 265 /*
 266 * 1/2 of a transaction can be reserved, so we can practically handle
 267 * only 1/2 of the maximum transaction size per operation.
268 */
269 if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
160 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 270 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
161 current->comm, nblocks, 271 current->comm, blocks,
162 journal->j_max_transaction_buffers); 272 journal->j_max_transaction_buffers / 2);
163 return -ENOSPC; 273 return -ENOSPC;
164 } 274 }
165 275
276 if (handle->h_rsv_handle)
277 rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
278
166alloc_transaction: 279alloc_transaction:
167 if (!journal->j_running_transaction) { 280 if (!journal->j_running_transaction) {
168 new_transaction = kmem_cache_zalloc(transaction_cache, 281 new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
199 return -EROFS; 312 return -EROFS;
200 } 313 }
201 314
202 /* Wait on the journal's transaction barrier if necessary */ 315 /*
203 if (journal->j_barrier_count) { 316 * Wait on the journal's transaction barrier if necessary. Specifically
317 * we allow reserved handles to proceed because otherwise commit could
318 * deadlock on page writeback not being able to complete.
319 */
320 if (!handle->h_reserved && journal->j_barrier_count) {
204 read_unlock(&journal->j_state_lock); 321 read_unlock(&journal->j_state_lock);
205 wait_event(journal->j_wait_transaction_locked, 322 wait_event(journal->j_wait_transaction_locked,
206 journal->j_barrier_count == 0); 323 journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
213 goto alloc_transaction; 330 goto alloc_transaction;
214 write_lock(&journal->j_state_lock); 331 write_lock(&journal->j_state_lock);
215 if (!journal->j_running_transaction && 332 if (!journal->j_running_transaction &&
216 !journal->j_barrier_count) { 333 (handle->h_reserved || !journal->j_barrier_count)) {
217 jbd2_get_transaction(journal, new_transaction); 334 jbd2_get_transaction(journal, new_transaction);
218 new_transaction = NULL; 335 new_transaction = NULL;
219 } 336 }
@@ -223,85 +340,18 @@ repeat:
223 340
224 transaction = journal->j_running_transaction; 341 transaction = journal->j_running_transaction;
225 342
226 /* 343 if (!handle->h_reserved) {
227 * If the current transaction is locked down for commit, wait for the 344 /* We may have dropped j_state_lock - restart in that case */
228 * lock to be released. 345 if (add_transaction_credits(journal, blocks, rsv_blocks))
229 */ 346 goto repeat;
230 if (transaction->t_state == T_LOCKED) { 347 } else {
231 DEFINE_WAIT(wait);
232
233 prepare_to_wait(&journal->j_wait_transaction_locked,
234 &wait, TASK_UNINTERRUPTIBLE);
235 read_unlock(&journal->j_state_lock);
236 schedule();
237 finish_wait(&journal->j_wait_transaction_locked, &wait);
238 goto repeat;
239 }
240
241 /*
242 * If there is not enough space left in the log to write all potential
243 * buffers requested by this operation, we need to stall pending a log
244 * checkpoint to free some more log space.
245 */
246 needed = atomic_add_return(nblocks,
247 &transaction->t_outstanding_credits);
248
249 if (needed > journal->j_max_transaction_buffers) {
250 /* 348 /*
251 * If the current transaction is already too large, then start 349 * We have handle reserved so we are allowed to join T_LOCKED
252 * to commit it: we can then go back and attach this handle to 350 * transaction and we don't have to check for transaction size
253 * a new transaction. 351 * and journal space.
254 */ 352 */
255 DEFINE_WAIT(wait); 353 sub_reserved_credits(journal, blocks);
256 354 handle->h_reserved = 0;
257 jbd_debug(2, "Handle %p starting new commit...\n", handle);
258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
259 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
260 TASK_UNINTERRUPTIBLE);
261 tid = transaction->t_tid;
262 need_to_start = !tid_geq(journal->j_commit_request, tid);
263 read_unlock(&journal->j_state_lock);
264 if (need_to_start)
265 jbd2_log_start_commit(journal, tid);
266 schedule();
267 finish_wait(&journal->j_wait_transaction_locked, &wait);
268 goto repeat;
269 }
270
271 /*
272 * The commit code assumes that it can get enough log space
273 * without forcing a checkpoint. This is *critical* for
274 * correctness: a checkpoint of a buffer which is also
275 * associated with a committing transaction creates a deadlock,
276 * so commit simply cannot force through checkpoints.
277 *
278 * We must therefore ensure the necessary space in the journal
279 * *before* starting to dirty potentially checkpointed buffers
280 * in the new transaction.
281 *
282 * The worst part is, any transaction currently committing can
283 * reduce the free space arbitrarily. Be careful to account for
284 * those buffers when checkpointing.
285 */
286
287 /*
288 * @@@ AKPM: This seems rather over-defensive. We're giving commit
289 * a _lot_ of headroom: 1/4 of the journal plus the size of
290 * the committing transaction. Really, we only need to give it
291 * committing_transaction->t_outstanding_credits plus "enough" for
292 * the log control blocks.
293 * Also, this test is inconsistent with the matching one in
294 * jbd2_journal_extend().
295 */
296 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
297 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
298 atomic_sub(nblocks, &transaction->t_outstanding_credits);
299 read_unlock(&journal->j_state_lock);
300 write_lock(&journal->j_state_lock);
301 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
302 __jbd2_log_wait_for_space(journal);
303 write_unlock(&journal->j_state_lock);
304 goto repeat;
305 } 355 }
306 356
307 /* OK, account for the buffers that this operation expects to 357 /* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
309 */ 359 */
310 update_t_max_wait(transaction, ts); 360 update_t_max_wait(transaction, ts);
311 handle->h_transaction = transaction; 361 handle->h_transaction = transaction;
312 handle->h_requested_credits = nblocks; 362 handle->h_requested_credits = blocks;
313 handle->h_start_jiffies = jiffies; 363 handle->h_start_jiffies = jiffies;
314 atomic_inc(&transaction->t_updates); 364 atomic_inc(&transaction->t_updates);
315 atomic_inc(&transaction->t_handle_count); 365 atomic_inc(&transaction->t_handle_count);
316 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 366 jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
317 handle, nblocks, 367 handle, blocks,
318 atomic_read(&transaction->t_outstanding_credits), 368 atomic_read(&transaction->t_outstanding_credits),
319 __jbd2_log_space_left(journal)); 369 jbd2_log_space_left(journal));
320 read_unlock(&journal->j_state_lock); 370 read_unlock(&journal->j_state_lock);
371 current->journal_info = handle;
321 372
322 lock_map_acquire(&handle->h_lockdep_map); 373 lock_map_acquire(&handle->h_lockdep_map);
323 jbd2_journal_free_transaction(new_transaction); 374 jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
348 * 399 *
349 * We make sure that the transaction can guarantee at least nblocks of 400 * We make sure that the transaction can guarantee at least nblocks of
350 * modified buffers in the log. We block until the log can guarantee 401 * modified buffers in the log. We block until the log can guarantee
351 * that much space. 402 * that much space. Additionally, if rsv_blocks > 0, we also create another
352 * 403 * handle with rsv_blocks reserved blocks in the journal. This handle is
353 * This function is visible to journal users (like ext3fs), so is not 404 * is stored in h_rsv_handle. It is not attached to any particular transaction
354 * called with the journal already locked. 405 * and thus doesn't block transaction commit. If the caller uses this reserved
406 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
407 * on the parent handle will dispose the reserved one. Reserved handle has to
408 * be converted to a normal handle using jbd2_journal_start_reserved() before
409 * it can be used.
355 * 410 *
356 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 411 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
357 * on failure. 412 * on failure.
358 */ 413 */
359handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, 414handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
360 unsigned int type, unsigned int line_no) 415 gfp_t gfp_mask, unsigned int type,
416 unsigned int line_no)
361{ 417{
362 handle_t *handle = journal_current_handle(); 418 handle_t *handle = journal_current_handle();
363 int err; 419 int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
374 handle = new_handle(nblocks); 430 handle = new_handle(nblocks);
375 if (!handle) 431 if (!handle)
376 return ERR_PTR(-ENOMEM); 432 return ERR_PTR(-ENOMEM);
433 if (rsv_blocks) {
434 handle_t *rsv_handle;
377 435
378 current->journal_info = handle; 436 rsv_handle = new_handle(rsv_blocks);
437 if (!rsv_handle) {
438 jbd2_free_handle(handle);
439 return ERR_PTR(-ENOMEM);
440 }
441 rsv_handle->h_reserved = 1;
442 rsv_handle->h_journal = journal;
443 handle->h_rsv_handle = rsv_handle;
444 }
379 445
380 err = start_this_handle(journal, handle, gfp_mask); 446 err = start_this_handle(journal, handle, gfp_mask);
381 if (err < 0) { 447 if (err < 0) {
448 if (handle->h_rsv_handle)
449 jbd2_free_handle(handle->h_rsv_handle);
382 jbd2_free_handle(handle); 450 jbd2_free_handle(handle);
383 current->journal_info = NULL;
384 return ERR_PTR(err); 451 return ERR_PTR(err);
385 } 452 }
386 handle->h_type = type; 453 handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
395 462
396handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 463handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
397{ 464{
398 return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); 465 return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
399} 466}
400EXPORT_SYMBOL(jbd2_journal_start); 467EXPORT_SYMBOL(jbd2_journal_start);
401 468
469void jbd2_journal_free_reserved(handle_t *handle)
470{
471 journal_t *journal = handle->h_journal;
472
473 WARN_ON(!handle->h_reserved);
474 sub_reserved_credits(journal, handle->h_buffer_credits);
475 jbd2_free_handle(handle);
476}
477EXPORT_SYMBOL(jbd2_journal_free_reserved);
478
479/**
480 * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
481 * @handle: handle to start
482 *
483 * Start handle that has been previously reserved with jbd2_journal_reserve().
484 * This attaches @handle to the running transaction (or creates one if there's
 485 * no transaction running). Unlike jbd2_journal_start() this function cannot
486 * block on journal commit, checkpointing, or similar stuff. It can block on
 487 * memory allocation or a frozen journal, though.
488 *
489 * Return 0 on success, non-zero on error - handle is freed in that case.
490 */
491int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
492 unsigned int line_no)
493{
494 journal_t *journal = handle->h_journal;
495 int ret = -EIO;
496
497 if (WARN_ON(!handle->h_reserved)) {
 498 /* Someone passed in a normal handle? Just stop it. */
499 jbd2_journal_stop(handle);
500 return ret;
501 }
502 /*
 503 * Usefulness of mixing reserved and unreserved handles is
504 * questionable. So far nobody seems to need it so just error out.
505 */
506 if (WARN_ON(current->journal_info)) {
507 jbd2_journal_free_reserved(handle);
508 return ret;
509 }
510
511 handle->h_journal = NULL;
512 /*
513 * GFP_NOFS is here because callers are likely from writeback or
514 * similarly constrained call sites
515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0)
518 jbd2_journal_free_reserved(handle);
519 handle->h_type = type;
520 handle->h_line_no = line_no;
521 return ret;
522}
523EXPORT_SYMBOL(jbd2_journal_start_reserved);
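
Taken together with jbd2__journal_start() above, a hedged sketch of the intended reserved-handle life cycle (illustrative only; the filesystem-side callers landed separately):

	handle_t *handle, *rsv;
	int err;

	/* reserve rsv_blocks while starting a normal handle */
	handle = jbd2__journal_start(journal, nblocks, rsv_blocks,
				     GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	rsv = handle->h_rsv_handle;
	handle->h_rsv_handle = NULL;	/* keep rsv past jbd2_journal_stop() */
	err = jbd2_journal_stop(handle);

	/* later, e.g. from writeback: bind the reservation to a transaction */
	err = jbd2_journal_start_reserved(rsv, 0, 0);
	if (err)
		return err;		/* rsv was already freed on error */
	/* ... use rsv like any other handle ... */
	err = jbd2_journal_stop(rsv);
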
402 524
403/** 525/**
404 * int jbd2_journal_extend() - extend buffer credits. 526 * int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
423int jbd2_journal_extend(handle_t *handle, int nblocks) 545int jbd2_journal_extend(handle_t *handle, int nblocks)
424{ 546{
425 transaction_t *transaction = handle->h_transaction; 547 transaction_t *transaction = handle->h_transaction;
426 journal_t *journal = transaction->t_journal; 548 journal_t *journal;
427 int result; 549 int result;
428 int wanted; 550 int wanted;
429 551
430 result = -EIO; 552 WARN_ON(!transaction);
431 if (is_handle_aborted(handle)) 553 if (is_handle_aborted(handle))
432 goto out; 554 return -EROFS;
555 journal = transaction->t_journal;
433 556
434 result = 1; 557 result = 1;
435 558
436 read_lock(&journal->j_state_lock); 559 read_lock(&journal->j_state_lock);
437 560
438 /* Don't extend a locked-down transaction! */ 561 /* Don't extend a locked-down transaction! */
439 if (handle->h_transaction->t_state != T_RUNNING) { 562 if (transaction->t_state != T_RUNNING) {
440 jbd_debug(3, "denied handle %p %d blocks: " 563 jbd_debug(3, "denied handle %p %d blocks: "
441 "transaction not running\n", handle, nblocks); 564 "transaction not running\n", handle, nblocks);
442 goto error_out; 565 goto error_out;
443 } 566 }
444 567
445 spin_lock(&transaction->t_handle_lock); 568 spin_lock(&transaction->t_handle_lock);
446 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; 569 wanted = atomic_add_return(nblocks,
570 &transaction->t_outstanding_credits);
447 571
448 if (wanted > journal->j_max_transaction_buffers) { 572 if (wanted > journal->j_max_transaction_buffers) {
449 jbd_debug(3, "denied handle %p %d blocks: " 573 jbd_debug(3, "denied handle %p %d blocks: "
450 "transaction too large\n", handle, nblocks); 574 "transaction too large\n", handle, nblocks);
575 atomic_sub(nblocks, &transaction->t_outstanding_credits);
451 goto unlock; 576 goto unlock;
452 } 577 }
453 578
454 if (wanted > __jbd2_log_space_left(journal)) { 579 if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
580 jbd2_log_space_left(journal)) {
455 jbd_debug(3, "denied handle %p %d blocks: " 581 jbd_debug(3, "denied handle %p %d blocks: "
456 "insufficient log space\n", handle, nblocks); 582 "insufficient log space\n", handle, nblocks);
583 atomic_sub(nblocks, &transaction->t_outstanding_credits);
457 goto unlock; 584 goto unlock;
458 } 585 }
459 586
460 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, 587 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
461 handle->h_transaction->t_tid, 588 transaction->t_tid,
462 handle->h_type, handle->h_line_no, 589 handle->h_type, handle->h_line_no,
463 handle->h_buffer_credits, 590 handle->h_buffer_credits,
464 nblocks); 591 nblocks);
465 592
466 handle->h_buffer_credits += nblocks; 593 handle->h_buffer_credits += nblocks;
467 handle->h_requested_credits += nblocks; 594 handle->h_requested_credits += nblocks;
468 atomic_add(nblocks, &transaction->t_outstanding_credits);
469 result = 0; 595 result = 0;
470 596
471 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 597 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
473 spin_unlock(&transaction->t_handle_lock); 599 spin_unlock(&transaction->t_handle_lock);
474error_out: 600error_out:
475 read_unlock(&journal->j_state_lock); 601 read_unlock(&journal->j_state_lock);
476out:
477 return result; 602 return result;
478} 603}
479 604
@@ -490,19 +615,22 @@ out:
490 * to a running handle, a call to jbd2_journal_restart will commit the 615 * to a running handle, a call to jbd2_journal_restart will commit the
491 * handle's transaction so far and reattach the handle to a new 616 * handle's transaction so far and reattach the handle to a new
 492 * transaction capable of guaranteeing the requested number of 617 * transaction capable of guaranteeing the requested number of
 493 * credits. 618 * credits. We preserve the reserved handle if one is attached to the
 619 * passed-in handle.
494 */ 620 */
495int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) 621int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
496{ 622{
497 transaction_t *transaction = handle->h_transaction; 623 transaction_t *transaction = handle->h_transaction;
498 journal_t *journal = transaction->t_journal; 624 journal_t *journal;
499 tid_t tid; 625 tid_t tid;
500 int need_to_start, ret; 626 int need_to_start, ret;
501 627
628 WARN_ON(!transaction);
502 /* If we've had an abort of any type, don't even think about 629 /* If we've had an abort of any type, don't even think about
503 * actually doing the restart! */ 630 * actually doing the restart! */
504 if (is_handle_aborted(handle)) 631 if (is_handle_aborted(handle))
505 return 0; 632 return 0;
633 journal = transaction->t_journal;
506 634
507 /* 635 /*
508 * First unlink the handle from its current transaction, and start the 636 * First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
515 spin_lock(&transaction->t_handle_lock); 643 spin_lock(&transaction->t_handle_lock);
516 atomic_sub(handle->h_buffer_credits, 644 atomic_sub(handle->h_buffer_credits,
517 &transaction->t_outstanding_credits); 645 &transaction->t_outstanding_credits);
646 if (handle->h_rsv_handle) {
647 sub_reserved_credits(journal,
648 handle->h_rsv_handle->h_buffer_credits);
649 }
518 if (atomic_dec_and_test(&transaction->t_updates)) 650 if (atomic_dec_and_test(&transaction->t_updates))
519 wake_up(&journal->j_wait_updates); 651 wake_up(&journal->j_wait_updates);
652 tid = transaction->t_tid;
520 spin_unlock(&transaction->t_handle_lock); 653 spin_unlock(&transaction->t_handle_lock);
654 handle->h_transaction = NULL;
655 current->journal_info = NULL;
521 656
522 jbd_debug(2, "restarting handle %p\n", handle); 657 jbd_debug(2, "restarting handle %p\n", handle);
523 tid = transaction->t_tid;
524 need_to_start = !tid_geq(journal->j_commit_request, tid); 658 need_to_start = !tid_geq(journal->j_commit_request, tid);
525 read_unlock(&journal->j_state_lock); 659 read_unlock(&journal->j_state_lock);
526 if (need_to_start) 660 if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
557 write_lock(&journal->j_state_lock); 691 write_lock(&journal->j_state_lock);
558 ++journal->j_barrier_count; 692 ++journal->j_barrier_count;
559 693
694 /* Wait until there are no reserved handles */
695 if (atomic_read(&journal->j_reserved_credits)) {
696 write_unlock(&journal->j_state_lock);
697 wait_event(journal->j_wait_reserved,
698 atomic_read(&journal->j_reserved_credits) == 0);
699 write_lock(&journal->j_state_lock);
700 }
701
560 /* Wait until there are no running updates */ 702 /* Wait until there are no running updates */
561 while (1) { 703 while (1) {
562 transaction_t *transaction = journal->j_running_transaction; 704 transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 761 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
620} 762}
621 763
764static int sleep_on_shadow_bh(void *word)
765{
766 io_schedule();
767 return 0;
768}
769
622/* 770/*
623 * If the buffer is already part of the current transaction, then there 771 * If the buffer is already part of the current transaction, then there
624 * is nothing we need to do. If it is already part of a prior 772 * is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
634 int force_copy) 782 int force_copy)
635{ 783{
636 struct buffer_head *bh; 784 struct buffer_head *bh;
637 transaction_t *transaction; 785 transaction_t *transaction = handle->h_transaction;
638 journal_t *journal; 786 journal_t *journal;
639 int error; 787 int error;
640 char *frozen_buffer = NULL; 788 char *frozen_buffer = NULL;
641 int need_copy = 0; 789 int need_copy = 0;
642 unsigned long start_lock, time_lock; 790 unsigned long start_lock, time_lock;
643 791
792 WARN_ON(!transaction);
644 if (is_handle_aborted(handle)) 793 if (is_handle_aborted(handle))
645 return -EROFS; 794 return -EROFS;
646
647 transaction = handle->h_transaction;
648 journal = transaction->t_journal; 795 journal = transaction->t_journal;
649 796
650 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); 797 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
754 * journaled. If the primary copy is already going to 901 * journaled. If the primary copy is already going to
755 * disk then we cannot do copy-out here. */ 902 * disk then we cannot do copy-out here. */
756 903
757 if (jh->b_jlist == BJ_Shadow) { 904 if (buffer_shadow(bh)) {
758 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
759 wait_queue_head_t *wqh;
760
761 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
762
763 JBUFFER_TRACE(jh, "on shadow: sleep"); 905 JBUFFER_TRACE(jh, "on shadow: sleep");
764 jbd_unlock_bh_state(bh); 906 jbd_unlock_bh_state(bh);
765 /* commit wakes up all shadow buffers after IO */ 907 wait_on_bit(&bh->b_state, BH_Shadow,
766 for ( ; ; ) { 908 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
767 prepare_to_wait(wqh, &wait.wait,
768 TASK_UNINTERRUPTIBLE);
769 if (jh->b_jlist != BJ_Shadow)
770 break;
771 schedule();
772 }
773 finish_wait(wqh, &wait.wait);
774 goto repeat; 909 goto repeat;
775 } 910 }
776 911
777 /* Only do the copy if the currently-owning transaction 912 /*
778 * still needs it. If it is on the Forget list, the 913 * Only do the copy if the currently-owning transaction still
 779 * committing transaction is past that stage. The 914 * needs it. If the buffer isn't on the BJ_Metadata list, the
780 * buffer had better remain locked during the kmalloc, 915 * committing transaction is past that stage (here we use the
781 * but that should be true --- we hold the journal lock 916 * fact that BH_Shadow is set under bh_state lock together with
782 * still and the buffer is already on the BUF_JOURNAL 917 * refiling to BJ_Shadow list and at this point we know the
783 * list so won't be flushed. 918 * buffer doesn't have BH_Shadow set).
784 * 919 *
785 * Subtle point, though: if this is a get_undo_access, 920 * Subtle point, though: if this is a get_undo_access,
786 * then we will be relying on the frozen_data to contain 921 * then we will be relying on the frozen_data to contain
787 * the new value of the committed_data record after the 922 * the new value of the committed_data record after the
788 * transaction, so we HAVE to force the frozen_data copy 923 * transaction, so we HAVE to force the frozen_data copy
789 * in that case. */ 924 * in that case.
790 925 */
791 if (jh->b_jlist != BJ_Forget || force_copy) { 926 if (jh->b_jlist == BJ_Metadata || force_copy) {
792 JBUFFER_TRACE(jh, "generate frozen data"); 927 JBUFFER_TRACE(jh, "generate frozen data");
793 if (!frozen_buffer) { 928 if (!frozen_buffer) {
794 JBUFFER_TRACE(jh, "allocate memory for buffer"); 929 JBUFFER_TRACE(jh, "allocate memory for buffer");
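
In this kernel generation wait_on_bit() takes an action callback that performs the actual sleep; sleep_on_shadow_bh() (added earlier in this patch) just calls io_schedule(). The matching waker on the commit side presumably clears the bit and wakes the bit waitqueue, following the ordering rule from the comment deleted in commit.c; a sketch of the pairing:

	/* waiter: sleeps until BH_Shadow is cleared */
	wait_on_bit(&bh->b_state, BH_Shadow, sleep_on_shadow_bh,
		    TASK_UNINTERRUPTIBLE);

	/* waker (sketch): publish the state change before the wakeup */
	clear_buffer_shadow(bh);
	smp_mb();
	wake_up_bit(&bh->b_state, BH_Shadow);
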
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
915int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) 1050int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
916{ 1051{
917 transaction_t *transaction = handle->h_transaction; 1052 transaction_t *transaction = handle->h_transaction;
918 journal_t *journal = transaction->t_journal; 1053 journal_t *journal;
919 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 1054 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
920 int err; 1055 int err;
921 1056
922 jbd_debug(5, "journal_head %p\n", jh); 1057 jbd_debug(5, "journal_head %p\n", jh);
1058 WARN_ON(!transaction);
923 err = -EROFS; 1059 err = -EROFS;
924 if (is_handle_aborted(handle)) 1060 if (is_handle_aborted(handle))
925 goto out; 1061 goto out;
1062 journal = transaction->t_journal;
926 err = 0; 1063 err = 0;
927 1064
928 JBUFFER_TRACE(jh, "entry"); 1065 JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1128int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1265int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1129{ 1266{
1130 transaction_t *transaction = handle->h_transaction; 1267 transaction_t *transaction = handle->h_transaction;
1131 journal_t *journal = transaction->t_journal; 1268 journal_t *journal;
1132 struct journal_head *jh; 1269 struct journal_head *jh;
1133 int ret = 0; 1270 int ret = 0;
1134 1271
1272 WARN_ON(!transaction);
1135 if (is_handle_aborted(handle)) 1273 if (is_handle_aborted(handle))
1136 goto out; 1274 return -EROFS;
1275 journal = transaction->t_journal;
1137 jh = jbd2_journal_grab_journal_head(bh); 1276 jh = jbd2_journal_grab_journal_head(bh);
1138 if (!jh) { 1277 if (!jh) {
1139 ret = -EUCLEAN; 1278 ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1227 1366
1228 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1367 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1229 spin_lock(&journal->j_list_lock); 1368 spin_lock(&journal->j_list_lock);
1230 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); 1369 __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1231 spin_unlock(&journal->j_list_lock); 1370 spin_unlock(&journal->j_list_lock);
1232out_unlock_bh: 1371out_unlock_bh:
1233 jbd_unlock_bh_state(bh); 1372 jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
1258int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) 1397int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1259{ 1398{
1260 transaction_t *transaction = handle->h_transaction; 1399 transaction_t *transaction = handle->h_transaction;
1261 journal_t *journal = transaction->t_journal; 1400 journal_t *journal;
1262 struct journal_head *jh; 1401 struct journal_head *jh;
1263 int drop_reserve = 0; 1402 int drop_reserve = 0;
1264 int err = 0; 1403 int err = 0;
1265 int was_modified = 0; 1404 int was_modified = 0;
1266 1405
1406 WARN_ON(!transaction);
1407 if (is_handle_aborted(handle))
1408 return -EROFS;
1409 journal = transaction->t_journal;
1410
1267 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1268 1412
1269 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1290 */ 1434 */
1291 jh->b_modified = 0; 1435 jh->b_modified = 0;
1292 1436
1293 if (jh->b_transaction == handle->h_transaction) { 1437 if (jh->b_transaction == transaction) {
1294 J_ASSERT_JH(jh, !jh->b_frozen_data); 1438 J_ASSERT_JH(jh, !jh->b_frozen_data);
1295 1439
1296 /* If we are forgetting a buffer which is already part 1440 /* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
1385int jbd2_journal_stop(handle_t *handle) 1529int jbd2_journal_stop(handle_t *handle)
1386{ 1530{
1387 transaction_t *transaction = handle->h_transaction; 1531 transaction_t *transaction = handle->h_transaction;
1388 journal_t *journal = transaction->t_journal; 1532 journal_t *journal;
1389 int err, wait_for_commit = 0; 1533 int err = 0, wait_for_commit = 0;
1390 tid_t tid; 1534 tid_t tid;
1391 pid_t pid; 1535 pid_t pid;
1392 1536
1537 if (!transaction)
1538 goto free_and_exit;
1539 journal = transaction->t_journal;
1540
1393 J_ASSERT(journal_current_handle() == handle); 1541 J_ASSERT(journal_current_handle() == handle);
1394 1542
1395 if (is_handle_aborted(handle)) 1543 if (is_handle_aborted(handle))
1396 err = -EIO; 1544 err = -EIO;
1397 else { 1545 else
1398 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 1546 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1399 err = 0;
1400 }
1401 1547
1402 if (--handle->h_ref > 0) { 1548 if (--handle->h_ref > 0) {
1403 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, 1549 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
1407 1553
1408 jbd_debug(4, "Handle %p going down\n", handle); 1554 jbd_debug(4, "Handle %p going down\n", handle);
1409 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, 1555 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1410 handle->h_transaction->t_tid, 1556 transaction->t_tid,
1411 handle->h_type, handle->h_line_no, 1557 handle->h_type, handle->h_line_no,
1412 jiffies - handle->h_start_jiffies, 1558 jiffies - handle->h_start_jiffies,
1413 handle->h_sync, handle->h_requested_credits, 1559 handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
1518 1664
1519 lock_map_release(&handle->h_lockdep_map); 1665 lock_map_release(&handle->h_lockdep_map);
1520 1666
1667 if (handle->h_rsv_handle)
1668 jbd2_journal_free_reserved(handle->h_rsv_handle);
1669free_and_exit:
1521 jbd2_free_handle(handle); 1670 jbd2_free_handle(handle);
1522 return err; 1671 return err;
1523} 1672}
1524 1673
1525/**
1526 * int jbd2_journal_force_commit() - force any uncommitted transactions
1527 * @journal: journal to force
1528 *
1529 * For synchronous operations: force any uncommitted transactions
1530 * to disk. May seem kludgy, but it reuses all the handle batching
1531 * code in a very simple manner.
1532 */
1533int jbd2_journal_force_commit(journal_t *journal)
1534{
1535 handle_t *handle;
1536 int ret;
1537
1538 handle = jbd2_journal_start(journal, 1);
1539 if (IS_ERR(handle)) {
1540 ret = PTR_ERR(handle);
1541 } else {
1542 handle->h_sync = 1;
1543 ret = jbd2_journal_stop(handle);
1544 }
1545 return ret;
1546}
1547
1548/* 1674/*
1549 * 1675 *
1550 * List management code snippets: various functions for manipulating the 1676 * List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1601 * Remove a buffer from the appropriate transaction list. 1727 * Remove a buffer from the appropriate transaction list.
1602 * 1728 *
1603 * Note that this function can *change* the value of 1729 * Note that this function can *change* the value of
1604 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, 1730 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
1605 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one 1731 * t_reserved_list. If the caller is holding onto a copy of one of these
1606 * of these pointers, it could go bad. Generally the caller needs to re-read 1732 * pointers, it could go bad. Generally the caller needs to re-read the
1607 * the pointer from the transaction_t. 1733 * pointer from the transaction_t.
1608 * 1734 *
1609 * Called under j_list_lock. 1735 * Called under j_list_lock.
1610 */ 1736 */
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1634 case BJ_Forget: 1760 case BJ_Forget:
1635 list = &transaction->t_forget; 1761 list = &transaction->t_forget;
1636 break; 1762 break;
1637 case BJ_IO:
1638 list = &transaction->t_iobuf_list;
1639 break;
1640 case BJ_Shadow: 1763 case BJ_Shadow:
1641 list = &transaction->t_shadow_list; 1764 list = &transaction->t_shadow_list;
1642 break; 1765 break;
1643 case BJ_LogCtl:
1644 list = &transaction->t_log_list;
1645 break;
1646 case BJ_Reserved: 1766 case BJ_Reserved:
1647 list = &transaction->t_reserved_list; 1767 list = &transaction->t_reserved_list;
1648 break; 1768 break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
2034 * void jbd2_journal_invalidatepage() 2154 * void jbd2_journal_invalidatepage()
2035 * @journal: journal to use for flush... 2155 * @journal: journal to use for flush...
2036 * @page: page to flush 2156 * @page: page to flush
2037 * @offset: length of page to invalidate. 2157 * @offset: start of the range to invalidate
2158 * @length: length of the range to invalidate
2038 * 2159 *
2039 * Reap page buffers containing data after offset in page. Can return -EBUSY 2160 * Reap page buffers containing data in the specified range of the page.
2040 * if buffers are part of the committing transaction and the page is straddling 2161 * Can return -EBUSY if buffers are part of the committing transaction and
2041 * i_size. Caller then has to wait for current commit and try again. 2162 * the page is straddling i_size. Caller then has to wait for current commit
2163 * and try again.
2042 */ 2164 */
2043int jbd2_journal_invalidatepage(journal_t *journal, 2165int jbd2_journal_invalidatepage(journal_t *journal,
2044 struct page *page, 2166 struct page *page,
2045 unsigned long offset) 2167 unsigned int offset,
2168 unsigned int length)
2046{ 2169{
2047 struct buffer_head *head, *bh, *next; 2170 struct buffer_head *head, *bh, *next;
2171 unsigned int stop = offset + length;
2048 unsigned int curr_off = 0; 2172 unsigned int curr_off = 0;
2173 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2049 int may_free = 1; 2174 int may_free = 1;
2050 int ret = 0; 2175 int ret = 0;
2051 2176
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2054 if (!page_has_buffers(page)) 2179 if (!page_has_buffers(page))
2055 return 0; 2180 return 0;
2056 2181
2182 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2183
2057 /* We will potentially be playing with lists other than just the 2184 /* We will potentially be playing with lists other than just the
2058 * data lists (especially for journaled data mode), so be 2185 * data lists (especially for journaled data mode), so be
2059 * cautious in our locking. */ 2186 * cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2063 unsigned int next_off = curr_off + bh->b_size; 2190 unsigned int next_off = curr_off + bh->b_size;
2064 next = bh->b_this_page; 2191 next = bh->b_this_page;
2065 2192
2193 if (next_off > stop)
2194 return 0;
2195
2066 if (offset <= curr_off) { 2196 if (offset <= curr_off) {
2067 /* This block is wholly outside the truncation point */ 2197 /* This block is wholly outside the truncation point */
2068 lock_buffer(bh); 2198 lock_buffer(bh);
2069 ret = journal_unmap_buffer(journal, bh, offset > 0); 2199 ret = journal_unmap_buffer(journal, bh, partial_page);
2070 unlock_buffer(bh); 2200 unlock_buffer(bh);
2071 if (ret < 0) 2201 if (ret < 0)
2072 return ret; 2202 return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2077 2207
2078 } while (bh != head); 2208 } while (bh != head);
2079 2209
2080 if (!offset) { 2210 if (!partial_page) {
2081 if (may_free && try_to_free_buffers(page)) 2211 if (may_free && try_to_free_buffers(page))
2082 J_ASSERT(!page_has_buffers(page)); 2212 J_ASSERT(!page_has_buffers(page));
2083 } 2213 }
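
The jbd2_journal_invalidatepage() hunks above are part of the series teaching ->invalidatepage about sub-page ranges: the new length argument bounds the buffer walk at stop = offset + length, and partial_page replaces the old !offset test so buffers are only freed when the whole page is going away. A minimal user-space sketch of just that range arithmetic (PAGE_SIZE and walk_buffers() are stand-ins, not the kernel code):

#include <stdio.h>

#define PAGE_SIZE 4096u   /* stand-in for PAGE_CACHE_SIZE */

/* Walk a page's buffers and report which fall wholly inside the
 * invalidated range [offset, offset + length); mirrors the new
 * next_off/stop/partial_page logic above in miniature. */
static void walk_buffers(unsigned int offset, unsigned int length,
                         unsigned int bh_size)
{
    unsigned int stop = offset + length;
    int partial_page = (offset || length < PAGE_SIZE);
    unsigned int curr_off;

    for (curr_off = 0; curr_off < PAGE_SIZE; curr_off += bh_size) {
        unsigned int next_off = curr_off + bh_size;

        if (next_off > stop)          /* buffer ends past the range */
            return;
        if (offset <= curr_off)       /* buffer wholly inside range */
            printf("invalidate buffer at %u (partial_page=%d)\n",
                   curr_off, partial_page);
    }
}

int main(void)
{
    walk_buffers(0, PAGE_SIZE, 1024);   /* whole page: all 4 buffers */
    walk_buffers(1024, 2048, 1024);     /* bytes 1024..3071 only */
    return 0;
}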
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2138 case BJ_Forget: 2268 case BJ_Forget:
2139 list = &transaction->t_forget; 2269 list = &transaction->t_forget;
2140 break; 2270 break;
2141 case BJ_IO:
2142 list = &transaction->t_iobuf_list;
2143 break;
2144 case BJ_Shadow: 2271 case BJ_Shadow:
2145 list = &transaction->t_shadow_list; 2272 list = &transaction->t_shadow_list;
2146 break; 2273 break;
2147 case BJ_LogCtl:
2148 list = &transaction->t_log_list;
2149 break;
2150 case BJ_Reserved: 2274 case BJ_Reserved:
2151 list = &transaction->t_reserved_list; 2275 list = &transaction->t_reserved_list;
2152 break; 2276 break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2248int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2372int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2249{ 2373{
2250 transaction_t *transaction = handle->h_transaction; 2374 transaction_t *transaction = handle->h_transaction;
2251 journal_t *journal = transaction->t_journal; 2375 journal_t *journal;
2252 2376
2377 WARN_ON(!transaction);
2253 if (is_handle_aborted(handle)) 2378 if (is_handle_aborted(handle))
2254 return -EIO; 2379 return -EROFS;
2380 journal = transaction->t_journal;
2255 2381
2256 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, 2382 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2257 transaction->t_tid); 2383 transaction->t_tid);
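
Every jbd2 hunk above makes the same defensive move: handle->h_transaction is no longer trusted unconditionally, so transaction->t_journal is read only after the WARN_ON/abort checks have passed. A minimal user-space sketch of the shape of that pattern (all types and the error value are stand-ins for the kernel's handle_t/transaction_t and -EROFS):

#include <stdio.h>

struct journal { const char *name; };
struct transaction { struct journal *t_journal; };
struct handle {
    struct transaction *h_transaction;
    int h_aborted;
};

static int is_handle_aborted(const struct handle *h)
{
    return h->h_aborted;
}

/* Only dereference transaction->t_journal once we know the handle
 * is live; an aborted/finished handle may not have a usable one. */
static int journal_op(struct handle *handle)
{
    struct transaction *transaction = handle->h_transaction;
    struct journal *journal;

    if (!transaction || is_handle_aborted(handle))
        return -30;                     /* stand-in for -EROFS */
    journal = transaction->t_journal;   /* safe past the checks */

    printf("operating on journal %s\n", journal->name);
    return 0;
}

int main(void)
{
    struct journal j = { "jbd2" };
    struct transaction t = { &j };
    struct handle live = { &t, 0 };
    struct handle dead = { &t, 1 };

    printf("live: %d\n", journal_op(&live));
    printf("dead: %d\n", journal_op(&dead));
    return 0;
}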
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4160cb..e3aac222472e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
22#include <linux/time.h> 22#include <linux/time.h>
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25static int jffs2_readdir (struct file *, void *, filldir_t); 25static int jffs2_readdir (struct file *, struct dir_context *);
26 26
27static int jffs2_create (struct inode *,struct dentry *,umode_t, 27static int jffs2_create (struct inode *,struct dentry *,umode_t,
28 bool); 28 bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
40const struct file_operations jffs2_dir_operations = 40const struct file_operations jffs2_dir_operations =
41{ 41{
42 .read = generic_read_dir, 42 .read = generic_read_dir,
43 .readdir = jffs2_readdir, 43 .iterate = jffs2_readdir,
44 .unlocked_ioctl=jffs2_ioctl, 44 .unlocked_ioctl=jffs2_ioctl,
45 .fsync = jffs2_fsync, 45 .fsync = jffs2_fsync,
46 .llseek = generic_file_llseek, 46 .llseek = generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
114/***********************************************************************/ 114/***********************************************************************/
115 115
116 116
117static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int jffs2_readdir(struct file *file, struct dir_context *ctx)
118{ 118{
119 struct jffs2_inode_info *f; 119 struct inode *inode = file_inode(file);
120 struct inode *inode = file_inode(filp); 120 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
121 struct jffs2_full_dirent *fd; 121 struct jffs2_full_dirent *fd;
122 unsigned long offset, curofs; 122 unsigned long curofs = 1;
123 123
124 jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", 124 jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
125 file_inode(filp)->i_ino);
126 125
127 f = JFFS2_INODE_INFO(inode); 126 if (!dir_emit_dots(file, ctx))
128 127 return 0;
129 offset = filp->f_pos;
130
131 if (offset == 0) {
132 jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
133 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
134 goto out;
135 offset++;
136 }
137 if (offset == 1) {
138 unsigned long pino = parent_ino(filp->f_path.dentry);
139 jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
140 if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
141 goto out;
142 offset++;
143 }
144 128
145 curofs=1;
146 mutex_lock(&f->sem); 129 mutex_lock(&f->sem);
147 for (fd = f->dents; fd; fd = fd->next) { 130 for (fd = f->dents; fd; fd = fd->next) {
148
149 curofs++; 131 curofs++;
150 /* First loop: curofs = 2; offset = 2 */ 132 /* First loop: curofs = 2; pos = 2 */
151 if (curofs < offset) { 133 if (curofs < ctx->pos) {
152 jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", 134 jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
153 fd->name, fd->ino, fd->type, curofs, offset); 135 fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
154 continue; 136 continue;
155 } 137 }
156 if (!fd->ino) { 138 if (!fd->ino) {
157 jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", 139 jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
158 fd->name); 140 fd->name);
159 offset++; 141 ctx->pos++;
160 continue; 142 continue;
161 } 143 }
162 jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", 144 jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
163 offset, fd->name, fd->ino, fd->type); 145 (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
164 if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) 146 if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
165 break; 147 break;
166 offset++; 148 ctx->pos++;
167 } 149 }
168 mutex_unlock(&f->sem); 150 mutex_unlock(&f->sem);
169 out:
170 filp->f_pos = offset;
171 return 0; 151 return 0;
172} 152}
173 153
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
347 (unsigned long long) blkno, 347 (unsigned long long) blkno,
348 (unsigned long long) nblocks); 348 (unsigned long long) nblocks);
349 jfs_error(ip->i_sb, 349 jfs_error(ip->i_sb, "block to be freed is outside the map\n");
350 "dbFree: block to be freed is outside the map");
351 return -EIO; 350 return -EIO;
352 } 351 }
353 352
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
384 383
385 /* free the blocks. */ 384 /* free the blocks. */
386 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 385 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
387 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 386 jfs_error(ip->i_sb, "error in block map\n");
388 release_metapage(mp); 387 release_metapage(mp);
389 IREAD_UNLOCK(ipbmap); 388 IREAD_UNLOCK(ipbmap);
390 return (rc); 389 return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
441 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 440 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
442 (unsigned long long) blkno, 441 (unsigned long long) blkno,
443 (unsigned long long) nblocks); 442 (unsigned long long) nblocks);
444 jfs_error(ipbmap->i_sb, 443 jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
445 "dbUpdatePMap: blocks are outside the map");
446 return -EIO; 444 return -EIO;
447 } 445 }
448 446
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
726 724
727 /* the hint should be within the map */ 725 /* the hint should be within the map */
728 if (hint >= mapSize) { 726 if (hint >= mapSize) {
729 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); 727 jfs_error(ip->i_sb, "the hint is outside the map\n");
730 return -EIO; 728 return -EIO;
731 } 729 }
732 730
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1057 bmp = sbi->bmap; 1055 bmp = sbi->bmap;
1058 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1056 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1059 IREAD_UNLOCK(ipbmap); 1057 IREAD_UNLOCK(ipbmap);
1060 jfs_error(ip->i_sb, 1058 jfs_error(ip->i_sb, "the block is outside the filesystem\n");
1061 "dbExtend: the block is outside the filesystem");
1062 return -EIO; 1059 return -EIO;
1063 } 1060 }
1064 1061
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1134 u32 mask; 1131 u32 mask;
1135 1132
1136 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1133 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1137 jfs_error(bmp->db_ipbmap->i_sb, 1134 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1138 "dbAllocNext: Corrupt dmap page");
1139 return -EIO; 1135 return -EIO;
1140 } 1136 }
1141 1137
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
1265 s8 *leaf; 1261 s8 *leaf;
1266 1262
1267 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1263 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1268 jfs_error(bmp->db_ipbmap->i_sb, 1264 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1269 "dbAllocNear: Corrupt dmap page");
1270 return -EIO; 1265 return -EIO;
1271 } 1266 }
1272 1267
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1381 */ 1376 */
1382 if (l2nb > bmp->db_agl2size) { 1377 if (l2nb > bmp->db_agl2size) {
1383 jfs_error(bmp->db_ipbmap->i_sb, 1378 jfs_error(bmp->db_ipbmap->i_sb,
1384 "dbAllocAG: allocation request is larger than the " 1379 "allocation request is larger than the allocation group size\n");
1385 "allocation group size");
1386 return -EIO; 1380 return -EIO;
1387 } 1381 }
1388 1382
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1417 (unsigned long long) blkno, 1411 (unsigned long long) blkno,
1418 (unsigned long long) nblocks); 1412 (unsigned long long) nblocks);
1419 jfs_error(bmp->db_ipbmap->i_sb, 1413 jfs_error(bmp->db_ipbmap->i_sb,
1420 "dbAllocAG: dbAllocCtl failed in free AG"); 1414 "dbAllocCtl failed in free AG\n");
1421 } 1415 }
1422 return (rc); 1416 return (rc);
1423 } 1417 }
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1433 budmin = dcp->budmin; 1427 budmin = dcp->budmin;
1434 1428
1435 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1429 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1436 jfs_error(bmp->db_ipbmap->i_sb, 1430 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
1437 "dbAllocAG: Corrupt dmapctl page");
1438 release_metapage(mp); 1431 release_metapage(mp);
1439 return -EIO; 1432 return -EIO;
1440 } 1433 }
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1475 } 1468 }
1476 if (n == 4) { 1469 if (n == 4) {
1477 jfs_error(bmp->db_ipbmap->i_sb, 1470 jfs_error(bmp->db_ipbmap->i_sb,
1478 "dbAllocAG: failed descending stree"); 1471 "failed descending stree\n");
1479 release_metapage(mp); 1472 release_metapage(mp);
1480 return -EIO; 1473 return -EIO;
1481 } 1474 }
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1515 &blkno))) { 1508 &blkno))) {
1516 if (rc == -ENOSPC) { 1509 if (rc == -ENOSPC) {
1517 jfs_error(bmp->db_ipbmap->i_sb, 1510 jfs_error(bmp->db_ipbmap->i_sb,
1518 "dbAllocAG: control page " 1511 "control page inconsistent\n");
1519 "inconsistent");
1520 return -EIO; 1512 return -EIO;
1521 } 1513 }
1522 return (rc); 1514 return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1528 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1520 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1529 if (rc == -ENOSPC) { 1521 if (rc == -ENOSPC) {
1530 jfs_error(bmp->db_ipbmap->i_sb, 1522 jfs_error(bmp->db_ipbmap->i_sb,
1531 "dbAllocAG: unable to allocate blocks"); 1523 "unable to allocate blocks\n");
1532 rc = -EIO; 1524 rc = -EIO;
1533 } 1525 }
1534 return (rc); 1526 return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1587 */ 1579 */
1588 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1580 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1589 if (rc == -ENOSPC) { 1581 if (rc == -ENOSPC) {
1590 jfs_error(bmp->db_ipbmap->i_sb, 1582 jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
1591 "dbAllocAny: unable to allocate blocks");
1592 return -EIO; 1583 return -EIO;
1593 } 1584 }
1594 return (rc); 1585 return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1652 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); 1643 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1653 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); 1644 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
1654 if (totrim == NULL) { 1645 if (totrim == NULL) {
1655 jfs_error(bmp->db_ipbmap->i_sb, 1646 jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
1656 "dbDiscardAG: no memory for trim array");
1657 IWRITE_UNLOCK(ipbmap); 1647 IWRITE_UNLOCK(ipbmap);
1658 return 0; 1648 return 0;
1659 } 1649 }
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1682 nblocks = 1 << l2nb; 1672 nblocks = 1 << l2nb;
1683 } else { 1673 } else {
1684 /* Trim any already allocated blocks */ 1674 /* Trim any already allocated blocks */
1685 jfs_error(bmp->db_ipbmap->i_sb, 1675 jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
1686 "dbDiscardAG: -EIO");
1687 break; 1676 break;
1688 } 1677 }
1689 1678
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1761 1750
1762 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1751 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1763 jfs_error(bmp->db_ipbmap->i_sb, 1752 jfs_error(bmp->db_ipbmap->i_sb,
1764 "dbFindCtl: Corrupt dmapctl page"); 1753 "Corrupt dmapctl page\n");
1765 release_metapage(mp); 1754 release_metapage(mp);
1766 return -EIO; 1755 return -EIO;
1767 } 1756 }
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1782 if (rc) { 1771 if (rc) {
1783 if (lev != level) { 1772 if (lev != level) {
1784 jfs_error(bmp->db_ipbmap->i_sb, 1773 jfs_error(bmp->db_ipbmap->i_sb,
1785 "dbFindCtl: dmap inconsistent"); 1774 "dmap inconsistent\n");
1786 return -EIO; 1775 return -EIO;
1787 } 1776 }
1788 return -ENOSPC; 1777 return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1906 if (dp->tree.stree[ROOT] != L2BPERDMAP) { 1895 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1907 release_metapage(mp); 1896 release_metapage(mp);
1908 jfs_error(bmp->db_ipbmap->i_sb, 1897 jfs_error(bmp->db_ipbmap->i_sb,
1909 "dbAllocCtl: the dmap is not all free"); 1898 "the dmap is not all free\n");
1910 rc = -EIO; 1899 rc = -EIO;
1911 goto backout; 1900 goto backout;
1912 } 1901 }
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1953 * to indicate that we have leaked blocks. 1942 * to indicate that we have leaked blocks.
1954 */ 1943 */
1955 jfs_error(bmp->db_ipbmap->i_sb, 1944 jfs_error(bmp->db_ipbmap->i_sb,
1956 "dbAllocCtl: I/O Error: Block Leakage."); 1945 "I/O Error: Block Leakage\n");
1957 continue; 1946 continue;
1958 } 1947 }
1959 dp = (struct dmap *) mp->data; 1948 dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1965 * to indicate that we have leaked blocks. 1954 * to indicate that we have leaked blocks.
1966 */ 1955 */
1967 release_metapage(mp); 1956 release_metapage(mp);
1968 jfs_error(bmp->db_ipbmap->i_sb, 1957 jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
1969 "dbAllocCtl: Block Leakage.");
1970 continue; 1958 continue;
1971 } 1959 }
1972 1960
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2263 for (; nwords > 0; nwords -= nw) { 2251 for (; nwords > 0; nwords -= nw) {
2264 if (leaf[word] < BUDMIN) { 2252 if (leaf[word] < BUDMIN) {
2265 jfs_error(bmp->db_ipbmap->i_sb, 2253 jfs_error(bmp->db_ipbmap->i_sb,
2266 "dbAllocBits: leaf page " 2254 "leaf page corrupt\n");
2267 "corrupt");
2268 break; 2255 break;
2269 } 2256 }
2270 2257
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2536 dcp = (struct dmapctl *) mp->data; 2523 dcp = (struct dmapctl *) mp->data;
2537 2524
2538 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2525 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2539 jfs_error(bmp->db_ipbmap->i_sb, 2526 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
2540 "dbAdjCtl: Corrupt dmapctl page");
2541 release_metapage(mp); 2527 release_metapage(mp);
2542 return -EIO; 2528 return -EIO;
2543 } 2529 }
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2638 assert(level == bmp->db_maxlevel); 2624 assert(level == bmp->db_maxlevel);
2639 if (bmp->db_maxfreebud != oldroot) { 2625 if (bmp->db_maxfreebud != oldroot) {
2640 jfs_error(bmp->db_ipbmap->i_sb, 2626 jfs_error(bmp->db_ipbmap->i_sb,
2641 "dbAdjCtl: the maximum free buddy is " 2627 "the maximum free buddy is not the old root\n");
2642 "not the old root");
2643 } 2628 }
2644 bmp->db_maxfreebud = dcp->stree[ROOT]; 2629 bmp->db_maxfreebud = dcp->stree[ROOT];
2645 } 2630 }
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3481 p = BMAPBLKNO + nbperpage; /* L2 page */ 3466 p = BMAPBLKNO + nbperpage; /* L2 page */
3482 l2mp = read_metapage(ipbmap, p, PSIZE, 0); 3467 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3483 if (!l2mp) { 3468 if (!l2mp) {
3484 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); 3469 jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
3485 return -EIO; 3470 return -EIO;
3486 } 3471 }
3487 l2dcp = (struct dmapctl *) l2mp->data; 3472 l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3646 } 3631 }
3647 } /* for each L1 in a L2 */ 3632 } /* for each L1 in a L2 */
3648 3633
3649 jfs_error(ipbmap->i_sb, 3634 jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
3650 "dbExtendFS: function has not returned as expected");
3651errout: 3635errout:
3652 if (l0mp) 3636 if (l0mp)
3653 release_metapage(l0mp); 3637 release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3717 } 3701 }
3718 if (bmp->db_agpref >= bmp->db_numag) { 3702 if (bmp->db_agpref >= bmp->db_numag) {
3719 jfs_error(ipbmap->i_sb, 3703 jfs_error(ipbmap->i_sb,
3720 "cannot find ag with average freespace"); 3704 "cannot find ag with average freespace\n");
3721 } 3705 }
3722 } 3706 }
3723 3707
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbeceafc62..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
125 125
126/* get page buffer for specified block address */ 126/* get page buffer for specified block address */
127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
128{\ 128do { \
129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ 129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
130 if (!(RC))\ 130 if (!(RC)) { \
131 {\ 131 if (((P)->header.nextindex > \
132 if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ 132 (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ 133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
134 {\ 134 BT_PUTPAGE(MP); \
135 BT_PUTPAGE(MP);\ 135 jfs_error((IP)->i_sb, \
136 jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ 136 "DT_GETPAGE: dtree page corrupt\n"); \
137 MP = NULL;\ 137 MP = NULL; \
138 RC = -EIO;\ 138 RC = -EIO; \
139 }\ 139 } \
140 }\ 140 } \
141} 141} while (0)
142 142
143/* for consistency */ 143/* for consistency */
144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
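
The DT_GETPAGE rewrite above (and XT_GETPAGE further down) wraps the macro body in do { ... } while (0), the standard idiom for multi-statement macros: the expansion becomes a single statement that requires a trailing semicolon, so it composes correctly with if/else. A standalone illustration (LOG_GOOD/LOG_BAD are hypothetical macros, not from jfs):

#include <stdio.h>

/* Brace-only body: expands to a compound statement, so writing
 * "if (x) LOG_BAD(...); else ..." leaves a stray ';' and the
 * else no longer parses. */
#define LOG_BAD(msg) \
    { printf("bad:  %s\n", msg); }

/* do/while(0) body: expands to a single statement that requires
 * the trailing ';', so it nests safely inside if/else. */
#define LOG_GOOD(msg) \
    do { printf("good: %s\n", msg); } while (0)

int main(void)
{
    int err = 1;

    if (err)
        LOG_GOOD("error path");
    else
        printf("ok\n");
    /* The same if/else written with LOG_BAD would not compile. */
    return 0;
}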
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
776 /* Something's corrupted, mark filesystem dirty so 776 /* Something's corrupted, mark filesystem dirty so
777 * chkdsk will fix it. 777 * chkdsk will fix it.
778 */ 778 */
779 jfs_error(sb, "stack overrun in dtSearch!"); 779 jfs_error(sb, "stack overrun!\n");
780 BT_STACK_DUMP(btstack); 780 BT_STACK_DUMP(btstack);
781 rc = -EIO; 781 rc = -EIO;
782 goto out; 782 goto out;
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
3002 * return: offset = (pn, index) of start entry 3002 * return: offset = (pn, index) of start entry
3003 * of next jfs_readdir()/dtRead() 3003 * of next jfs_readdir()/dtRead()
3004 */ 3004 */
3005int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 3005int jfs_readdir(struct file *file, struct dir_context *ctx)
3006{ 3006{
3007 struct inode *ip = file_inode(filp); 3007 struct inode *ip = file_inode(file);
3008 struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; 3008 struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
3009 int rc = 0; 3009 int rc = 0;
3010 loff_t dtpos; /* legacy OS/2 style position */ 3010 loff_t dtpos; /* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3033 int overflow, fix_page, page_fixed = 0; 3033 int overflow, fix_page, page_fixed = 0;
3034 static int unique_pos = 2; /* If we can't fix broken index */ 3034 static int unique_pos = 2; /* If we can't fix broken index */
3035 3035
3036 if (filp->f_pos == DIREND) 3036 if (ctx->pos == DIREND)
3037 return 0; 3037 return 0;
3038 3038
3039 if (DO_INDEX(ip)) { 3039 if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3045 */ 3045 */
3046 do_index = 1; 3046 do_index = 1;
3047 3047
3048 dir_index = (u32) filp->f_pos; 3048 dir_index = (u32) ctx->pos;
3049 3049
3050 if (dir_index > 1) { 3050 if (dir_index > 1) {
3051 struct dir_table_slot dirtab_slot; 3051 struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3053 if (dtEmpty(ip) || 3053 if (dtEmpty(ip) ||
3054 (dir_index >= JFS_IP(ip)->next_index)) { 3054 (dir_index >= JFS_IP(ip)->next_index)) {
3055 /* Stale position. Directory has shrunk */ 3055 /* Stale position. Directory has shrunk */
3056 filp->f_pos = DIREND; 3056 ctx->pos = DIREND;
3057 return 0; 3057 return 0;
3058 } 3058 }
3059 repeat: 3059 repeat:
3060 rc = read_index(ip, dir_index, &dirtab_slot); 3060 rc = read_index(ip, dir_index, &dirtab_slot);
3061 if (rc) { 3061 if (rc) {
3062 filp->f_pos = DIREND; 3062 ctx->pos = DIREND;
3063 return rc; 3063 return rc;
3064 } 3064 }
3065 if (dirtab_slot.flag == DIR_INDEX_FREE) { 3065 if (dirtab_slot.flag == DIR_INDEX_FREE) {
3066 if (loop_count++ > JFS_IP(ip)->next_index) { 3066 if (loop_count++ > JFS_IP(ip)->next_index) {
3067 jfs_err("jfs_readdir detected " 3067 jfs_err("jfs_readdir detected "
3068 "infinite loop!"); 3068 "infinite loop!");
3069 filp->f_pos = DIREND; 3069 ctx->pos = DIREND;
3070 return 0; 3070 return 0;
3071 } 3071 }
3072 dir_index = le32_to_cpu(dirtab_slot.addr2); 3072 dir_index = le32_to_cpu(dirtab_slot.addr2);
3073 if (dir_index == -1) { 3073 if (dir_index == -1) {
3074 filp->f_pos = DIREND; 3074 ctx->pos = DIREND;
3075 return 0; 3075 return 0;
3076 } 3076 }
3077 goto repeat; 3077 goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3080 index = dirtab_slot.slot; 3080 index = dirtab_slot.slot;
3081 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 3081 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3082 if (rc) { 3082 if (rc) {
3083 filp->f_pos = DIREND; 3083 ctx->pos = DIREND;
3084 return 0; 3084 return 0;
3085 } 3085 }
3086 if (p->header.flag & BT_INTERNAL) { 3086 if (p->header.flag & BT_INTERNAL) {
3087 jfs_err("jfs_readdir: bad index table"); 3087 jfs_err("jfs_readdir: bad index table");
3088 DT_PUTPAGE(mp); 3088 DT_PUTPAGE(mp);
3089 filp->f_pos = -1; 3089 ctx->pos = -1;
3090 return 0; 3090 return 0;
3091 } 3091 }
3092 } else { 3092 } else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3094 /* 3094 /*
3095 * self "." 3095 * self "."
3096 */ 3096 */
3097 filp->f_pos = 0; 3097 ctx->pos = 0;
3098 if (filldir(dirent, ".", 1, 0, ip->i_ino, 3098 if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3099 DT_DIR))
3100 return 0; 3099 return 0;
3101 } 3100 }
3102 /* 3101 /*
3103 * parent ".." 3102 * parent ".."
3104 */ 3103 */
3105 filp->f_pos = 1; 3104 ctx->pos = 1;
3106 if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) 3105 if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3107 return 0; 3106 return 0;
3108 3107
3109 /* 3108 /*
3110 * Find first entry of left-most leaf 3109 * Find first entry of left-most leaf
3111 */ 3110 */
3112 if (dtEmpty(ip)) { 3111 if (dtEmpty(ip)) {
3113 filp->f_pos = DIREND; 3112 ctx->pos = DIREND;
3114 return 0; 3113 return 0;
3115 } 3114 }
3116 3115
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3128 * pn > 0: Real entries, pn=1 -> leftmost page 3127 * pn > 0: Real entries, pn=1 -> leftmost page
3129 * pn = index = -1: No more entries 3128 * pn = index = -1: No more entries
3130 */ 3129 */
3131 dtpos = filp->f_pos; 3130 dtpos = ctx->pos;
3132 if (dtpos == 0) { 3131 if (dtpos == 0) {
3133 /* build "." entry */ 3132 /* build "." entry */
3134 3133 if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3135 if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
3136 DT_DIR))
3137 return 0; 3134 return 0;
3138 dtoffset->index = 1; 3135 dtoffset->index = 1;
3139 filp->f_pos = dtpos; 3136 ctx->pos = dtpos;
3140 } 3137 }
3141 3138
3142 if (dtoffset->pn == 0) { 3139 if (dtoffset->pn == 0) {
3143 if (dtoffset->index == 1) { 3140 if (dtoffset->index == 1) {
3144 /* build ".." entry */ 3141 /* build ".." entry */
3145 3142 if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3146 if (filldir(dirent, "..", 2, filp->f_pos,
3147 PARENT(ip), DT_DIR))
3148 return 0; 3143 return 0;
3149 } else { 3144 } else {
3150 jfs_err("jfs_readdir called with " 3145 jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3152 } 3147 }
3153 dtoffset->pn = 1; 3148 dtoffset->pn = 1;
3154 dtoffset->index = 0; 3149 dtoffset->index = 0;
3155 filp->f_pos = dtpos; 3150 ctx->pos = dtpos;
3156 } 3151 }
3157 3152
3158 if (dtEmpty(ip)) { 3153 if (dtEmpty(ip)) {
3159 filp->f_pos = DIREND; 3154 ctx->pos = DIREND;
3160 return 0; 3155 return 0;
3161 } 3156 }
3162 3157
3163 if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { 3158 if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
3164 jfs_err("jfs_readdir: unexpected rc = %d " 3159 jfs_err("jfs_readdir: unexpected rc = %d "
3165 "from dtReadNext", rc); 3160 "from dtReadNext", rc);
3166 filp->f_pos = DIREND; 3161 ctx->pos = DIREND;
3167 return 0; 3162 return 0;
3168 } 3163 }
3169 /* get start leaf page and index */ 3164 /* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3171 3166
3172 /* offset beyond directory eof ? */ 3167 /* offset beyond directory eof ? */
3173 if (bn < 0) { 3168 if (bn < 0) {
3174 filp->f_pos = DIREND; 3169 ctx->pos = DIREND;
3175 return 0; 3170 return 0;
3176 } 3171 }
3177 } 3172 }
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3180 if (dirent_buf == 0) { 3175 if (dirent_buf == 0) {
3181 DT_PUTPAGE(mp); 3176 DT_PUTPAGE(mp);
3182 jfs_warn("jfs_readdir: __get_free_page failed!"); 3177 jfs_warn("jfs_readdir: __get_free_page failed!");
3183 filp->f_pos = DIREND; 3178 ctx->pos = DIREND;
3184 return -ENOMEM; 3179 return -ENOMEM;
3185 } 3180 }
3186 3181
@@ -3252,8 +3247,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3252 /* Sanity Check */ 3247 /* Sanity Check */
3253 if (d_namleft == 0) { 3248 if (d_namleft == 0) {
3254 jfs_error(ip->i_sb, 3249 jfs_error(ip->i_sb,
3255 "JFS:Dtree error: ino = " 3250 "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
3256 "%ld, bn=%Ld, index = %d",
3257 (long)ip->i_ino, 3251 (long)ip->i_ino,
3258 (long long)bn, 3252 (long long)bn,
3259 i); 3253 i);
@@ -3295,9 +3289,9 @@ skip_one:
3295 3289
3296 jfs_dirent = (struct jfs_dirent *) dirent_buf; 3290 jfs_dirent = (struct jfs_dirent *) dirent_buf;
3297 while (jfs_dirents--) { 3291 while (jfs_dirents--) {
3298 filp->f_pos = jfs_dirent->position; 3292 ctx->pos = jfs_dirent->position;
3299 if (filldir(dirent, jfs_dirent->name, 3293 if (!dir_emit(ctx, jfs_dirent->name,
3300 jfs_dirent->name_len, filp->f_pos, 3294 jfs_dirent->name_len,
3301 jfs_dirent->ino, DT_UNKNOWN)) 3295 jfs_dirent->ino, DT_UNKNOWN))
3302 goto out; 3296 goto out;
3303 jfs_dirent = next_jfs_dirent(jfs_dirent); 3297 jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3303,7 @@ skip_one:
3309 } 3303 }
3310 3304
3311 if (!overflow && (bn == 0)) { 3305 if (!overflow && (bn == 0)) {
3312 filp->f_pos = DIREND; 3306 ctx->pos = DIREND;
3313 break; 3307 break;
3314 } 3308 }
3315 3309
@@ -3373,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3373 */ 3367 */
3374 if (BT_STACK_FULL(btstack)) { 3368 if (BT_STACK_FULL(btstack)) {
3375 DT_PUTPAGE(mp); 3369 DT_PUTPAGE(mp);
3376 jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); 3370 jfs_error(ip->i_sb, "btstack overrun\n");
3377 BT_STACK_DUMP(btstack); 3371 BT_STACK_DUMP(btstack);
3378 return -EIO; 3372 return -EIO;
3379 } 3373 }
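
The reworded messages above sit on an unchanged guard: dtSearch() and dtReadFirst() check BT_STACK_FULL before pushing a parent frame and fail with -EIO rather than overrun the fixed-size btstack, treating an over-deep tree as corruption. A standalone sketch of that bounded-stack pattern (sizes, names and the error value are stand-ins):

#include <stdio.h>

#define STACK_MAX 8   /* stand-in for the kernel's fixed stack depth */

struct btframe { long bn; int index; };
struct btstack {
    struct btframe frames[STACK_MAX];
    int top;
};

static int bt_stack_full(const struct btstack *s)
{
    return s->top >= STACK_MAX;
}

/* Push a parent frame during descent; a full stack means the tree
 * is deeper than any valid one, so treat it as corruption. */
static int bt_push(struct btstack *s, long bn, int index)
{
    if (bt_stack_full(s)) {
        fprintf(stderr, "btstack overrun\n");
        return -5;                     /* stand-in for -EIO */
    }
    s->frames[s->top].bn = bn;
    s->frames[s->top].index = index;
    s->top++;
    return 0;
}

int main(void)
{
    struct btstack s = { .top = 0 };
    long bn;

    for (bn = 0; bn < 10; bn++)        /* pretend 10-level descent */
        if (bt_push(&s, bn, 0))
            break;                     /* abort the walk on overrun */
    printf("pushed %d frames\n", s.top);
    return 0;
}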
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb317235..fd4169e6e698 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
265extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, 265extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
266 ino_t * orig_ino, ino_t new_ino, int flag); 266 ino_t * orig_ino, ino_t new_ino, int flag);
267 267
268extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); 268extern int jfs_readdir(struct file *file, struct dir_context *ctx);
269#endif /* !_H_JFS_DTREE */ 269#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
388 388
389 if ((rc == 0) && xlen) { 389 if ((rc == 0) && xlen) {
390 if (xlen != nbperpage) { 390 if (xlen != nbperpage) {
391 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 391 jfs_error(ip->i_sb, "corrupt xtree\n");
392 rc = -EIO; 392 rc = -EIO;
393 } 393 }
394 XADaddress(xp, xaddr); 394 XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
386 dp += rel_inode; 386 dp += rel_inode;
387 387
388 if (ip->i_ino != le32_to_cpu(dp->di_number)) { 388 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
389 jfs_error(ip->i_sb, "diRead: i_ino != di_number"); 389 jfs_error(ip->i_sb, "i_ino != di_number\n");
390 rc = -EIO; 390 rc = -EIO;
391 } else if (le32_to_cpu(dp->di_nlink) == 0) 391 } else if (le32_to_cpu(dp->di_nlink) == 0)
392 rc = -ESTALE; 392 rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
625 if (!addressPXD(&(jfs_ip->ixpxd)) || 625 if (!addressPXD(&(jfs_ip->ixpxd)) ||
626 (lengthPXD(&(jfs_ip->ixpxd)) != 626 (lengthPXD(&(jfs_ip->ixpxd)) !=
627 JFS_IP(ipimap)->i_imap->im_nbperiext)) { 627 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
628 jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); 628 jfs_error(ip->i_sb, "ixpxd invalid\n");
629 return -EIO; 629 return -EIO;
630 } 630 }
631 631
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
893 if (iagno >= imap->im_nextiag) { 893 if (iagno >= imap->im_nextiag) {
894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, 894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
895 imap, 32, 0); 895 imap, 32, 0);
896 jfs_error(ip->i_sb, 896 jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
897 "diFree: inum = %d, iagno = %d, nextiag = %d",
898 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
899 return -EIO; 898 return -EIO;
900 } 899 }
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
930 mask = HIGHORDER >> bitno; 929 mask = HIGHORDER >> bitno;
931 930
932 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
933 jfs_error(ip->i_sb, 932 jfs_error(ip->i_sb, "wmap shows inode already free\n");
934 "diFree: wmap shows inode already free");
935 } 933 }
936 934
937 if (!addressPXD(&iagp->inoext[extno])) { 935 if (!addressPXD(&iagp->inoext[extno])) {
938 release_metapage(mp); 936 release_metapage(mp);
939 IREAD_UNLOCK(ipimap); 937 IREAD_UNLOCK(ipimap);
940 AG_UNLOCK(imap, agno); 938 AG_UNLOCK(imap, agno);
941 jfs_error(ip->i_sb, "diFree: invalid inoext"); 939 jfs_error(ip->i_sb, "invalid inoext\n");
942 return -EIO; 940 return -EIO;
943 } 941 }
944 942
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
950 release_metapage(mp); 948 release_metapage(mp);
951 IREAD_UNLOCK(ipimap); 949 IREAD_UNLOCK(ipimap);
952 AG_UNLOCK(imap, agno); 950 AG_UNLOCK(imap, agno);
953 jfs_error(ip->i_sb, "diFree: numfree > numinos"); 951 jfs_error(ip->i_sb, "numfree > numinos\n");
954 return -EIO; 952 return -EIO;
955 } 953 }
956 /* 954 /*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
1199 * for the inode being freed. 1197 * for the inode being freed.
1200 */ 1198 */
1201 if (iagp->pmap[extno] != 0) { 1199 if (iagp->pmap[extno] != 0) {
1202 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); 1200 jfs_error(ip->i_sb, "the pmap does not show inode free\n");
1203 } 1201 }
1204 iagp->wmap[extno] = 0; 1202 iagp->wmap[extno] = 0;
1205 PXDlength(&iagp->inoext[extno], 0); 1203 PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1518 release_metapage(mp); 1516 release_metapage(mp);
1519 AG_UNLOCK(imap, agno); 1517 AG_UNLOCK(imap, agno);
1520 jfs_error(ip->i_sb, 1518 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1519 "can't find free bit in wmap\n");
1522 "in wmap");
1523 return -EIO; 1520 return -EIO;
1524 } 1521 }
1525 1522
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1660 numinos = imap->im_agctl[agno].numinos; 1657 numinos = imap->im_agctl[agno].numinos;
1661 1658
1662 if (numfree > numinos) { 1659 if (numfree > numinos) {
1663 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); 1660 jfs_error(ip->i_sb, "numfree > numinos\n");
1664 return -EIO; 1661 return -EIO;
1665 } 1662 }
1666 1663
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1811 if (!iagp->nfreeinos) { 1808 if (!iagp->nfreeinos) {
1812 IREAD_UNLOCK(imap->im_ipimap); 1809 IREAD_UNLOCK(imap->im_ipimap);
1813 release_metapage(mp); 1810 release_metapage(mp);
1814 jfs_error(ip->i_sb, 1811 jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
1815 "diAllocIno: nfreeinos = 0, but iag on freelist");
1816 return -EIO; 1812 return -EIO;
1817 } 1813 }
1818 1814
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1824 IREAD_UNLOCK(imap->im_ipimap); 1820 IREAD_UNLOCK(imap->im_ipimap);
1825 release_metapage(mp); 1821 release_metapage(mp);
1826 jfs_error(ip->i_sb, 1822 jfs_error(ip->i_sb,
1827 "diAllocIno: free inode not found in summary map"); 1823 "free inode not found in summary map\n");
1828 return -EIO; 1824 return -EIO;
1829 } 1825 }
1830 1826
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1839 if (rem >= EXTSPERSUM) { 1835 if (rem >= EXTSPERSUM) {
1840 IREAD_UNLOCK(imap->im_ipimap); 1836 IREAD_UNLOCK(imap->im_ipimap);
1841 release_metapage(mp); 1837 release_metapage(mp);
1842 jfs_error(ip->i_sb, "diAllocIno: no free extent found"); 1838 jfs_error(ip->i_sb, "no free extent found\n");
1843 return -EIO; 1839 return -EIO;
1844 } 1840 }
1845 extno = (sword << L2EXTSPERSUM) + rem; 1841 extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1850 if (rem >= INOSPEREXT) { 1846 if (rem >= INOSPEREXT) {
1851 IREAD_UNLOCK(imap->im_ipimap); 1847 IREAD_UNLOCK(imap->im_ipimap);
1852 release_metapage(mp); 1848 release_metapage(mp);
1853 jfs_error(ip->i_sb, "diAllocIno: free inode not found"); 1849 jfs_error(ip->i_sb, "free inode not found\n");
1854 return -EIO; 1850 return -EIO;
1855 } 1851 }
1856 1852
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1936 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1932 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1937 if ((rc = diIAGRead(imap, iagno, &mp))) { 1933 if ((rc = diIAGRead(imap, iagno, &mp))) {
1938 IREAD_UNLOCK(imap->im_ipimap); 1934 IREAD_UNLOCK(imap->im_ipimap);
1939 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1935 jfs_error(ip->i_sb, "error reading iag\n");
1940 return rc; 1936 return rc;
1941 } 1937 }
1942 iagp = (struct iag *) mp->data; 1938 iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1948 if (sword >= SMAPSZ) { 1944 if (sword >= SMAPSZ) {
1949 release_metapage(mp); 1945 release_metapage(mp);
1950 IREAD_UNLOCK(imap->im_ipimap); 1946 IREAD_UNLOCK(imap->im_ipimap);
1951 jfs_error(ip->i_sb, 1947 jfs_error(ip->i_sb, "free ext summary map not found\n");
1952 "diAllocExt: free ext summary map not found");
1953 return -EIO; 1948 return -EIO;
1954 } 1949 }
1955 if (~iagp->extsmap[sword]) 1950 if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1962 if (rem >= EXTSPERSUM) { 1957 if (rem >= EXTSPERSUM) {
1963 release_metapage(mp); 1958 release_metapage(mp);
1964 IREAD_UNLOCK(imap->im_ipimap); 1959 IREAD_UNLOCK(imap->im_ipimap);
1965 jfs_error(ip->i_sb, "diAllocExt: free extent not found"); 1960 jfs_error(ip->i_sb, "free extent not found\n");
1966 return -EIO; 1961 return -EIO;
1967 } 1962 }
1968 extno = (sword << L2EXTSPERSUM) + rem; 1963 extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2081 if (bmp) 2076 if (bmp)
2082 release_metapage(bmp); 2077 release_metapage(bmp);
2083 2078
2084 jfs_error(imap->im_ipimap->i_sb, 2079 jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
2085 "diAllocBit: iag inconsistent");
2086 return -EIO; 2080 return -EIO;
2087 } 2081 }
2088 2082
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2189 /* better have free extents. 2183 /* better have free extents.
2190 */ 2184 */
2191 if (!iagp->nfreeexts) { 2185 if (!iagp->nfreeexts) {
2192 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); 2186 jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
2193 return -EIO; 2187 return -EIO;
2194 } 2188 }
2195 2189
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2261 } 2255 }
2262 if (ciagp == NULL) { 2256 if (ciagp == NULL) {
2263 jfs_error(imap->im_ipimap->i_sb, 2257 jfs_error(imap->im_ipimap->i_sb,
2264 "diNewExt: ciagp == NULL"); 2258 "ciagp == NULL\n");
2265 rc = -EIO; 2259 rc = -EIO;
2266 goto error_out; 2260 goto error_out;
2267 } 2261 }
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2498 IWRITE_UNLOCK(ipimap); 2492 IWRITE_UNLOCK(ipimap);
2499 IAGFREE_UNLOCK(imap); 2493 IAGFREE_UNLOCK(imap);
2500 jfs_error(imap->im_ipimap->i_sb, 2494 jfs_error(imap->im_ipimap->i_sb,
2501 "diNewIAG: ipimap->i_size is wrong"); 2495 "ipimap->i_size is wrong\n");
2502 return -EIO; 2496 return -EIO;
2503 } 2497 }
2504 2498
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
2758 iagno = INOTOIAG(inum); 2752 iagno = INOTOIAG(inum);
2759 /* make sure that the iag is contained within the map */ 2753 /* make sure that the iag is contained within the map */
2760 if (iagno >= imap->im_nextiag) { 2754 if (iagno >= imap->im_nextiag) {
2761 jfs_error(ipimap->i_sb, 2755 jfs_error(ipimap->i_sb, "the iag is outside the map\n");
2762 "diUpdatePMap: the iag is outside the map");
2763 return -EIO; 2756 return -EIO;
2764 } 2757 }
2765 /* read the iag */ 2758 /* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
2788 */ 2781 */
2789 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2782 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2790 jfs_error(ipimap->i_sb, 2783 jfs_error(ipimap->i_sb,
2791 "diUpdatePMap: inode %ld not marked as " 2784 "inode %ld not marked as allocated in wmap!\n",
2792 "allocated in wmap!", inum); 2785 inum);
2793 } 2786 }
2794 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { 2787 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2795 jfs_error(ipimap->i_sb, 2788 jfs_error(ipimap->i_sb,
2796 "diUpdatePMap: inode %ld not marked as " 2789 "inode %ld not marked as allocated in pmap!\n",
2797 "allocated in pmap!", inum); 2790 inum);
2798 } 2791 }
2799 /* update the bitmap for the extent of the freed inode */ 2792 /* update the bitmap for the extent of the freed inode */
2800 iagp->pmap[extno] &= cpu_to_le32(~mask); 2793 iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
2809 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2802 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2810 release_metapage(mp); 2803 release_metapage(mp);
2811 jfs_error(ipimap->i_sb, 2804 jfs_error(ipimap->i_sb,
2812 "diUpdatePMap: the inode is not allocated in " 2805 "the inode is not allocated in the working map\n");
2813 "the working map");
2814 return -EIO; 2806 return -EIO;
2815 } 2807 }
2816 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { 2808 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2817 release_metapage(mp); 2809 release_metapage(mp);
2818 jfs_error(ipimap->i_sb, 2810 jfs_error(ipimap->i_sb,
2819 "diUpdatePMap: the inode is not free in the " 2811 "the inode is not free in the persistent map\n");
2820 "persistent map");
2821 return -EIO; 2812 return -EIO;
2822 } 2813 }
2823 /* update the bitmap for the extent of the allocated inode */ 2814 /* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2909 iagp = (struct iag *) bp->data; 2900 iagp = (struct iag *) bp->data;
2910 if (le32_to_cpu(iagp->iagnum) != i) { 2901 if (le32_to_cpu(iagp->iagnum) != i) {
2911 release_metapage(bp); 2902 release_metapage(bp);
2912 jfs_error(ipimap->i_sb, 2903 jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
2913 "diExtendFs: unexpected value of iagnum");
2914 return -EIO; 2904 return -EIO;
2915 } 2905 }
2916 2906
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2986 2976
2987 if (xnuminos != atomic_read(&imap->im_numinos) || 2977 if (xnuminos != atomic_read(&imap->im_numinos) ||
2988 xnumfree != atomic_read(&imap->im_numfree)) { 2978 xnumfree != atomic_read(&imap->im_numfree)) {
2989 jfs_error(ipimap->i_sb, 2979 jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
2990 "diExtendFs: numinos or numfree incorrect");
2991 return -EIO; 2980 return -EIO;
2992 } 2981 }
2993 2982
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
571 return ret; 571 return ret;
572} 572}
573 573
574static void metapage_invalidatepage(struct page *page, unsigned long offset) 574static void metapage_invalidatepage(struct page *page, unsigned int offset,
575 unsigned int length)
575{ 576{
576 BUG_ON(offset); 577 BUG_ON(offset || length < PAGE_CACHE_SIZE);
577 578
578 BUG_ON(PageWriteback(page)); 579 BUG_ON(PageWriteback(page));
579 580
@@ -646,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
646 if (mp) { 647 if (mp) {
647 if (mp->logical_size != size) { 648 if (mp->logical_size != size) {
648 jfs_error(inode->i_sb, 649 jfs_error(inode->i_sb,
649 "__get_metapage: mp->logical_size != size"); 650 "get_mp->logical_size != size\n");
650 jfs_err("logical_size = %d, size = %d", 651 jfs_err("logical_size = %d, size = %d",
651 mp->logical_size, size); 652 mp->logical_size, size);
652 dump_stack(); 653 dump_stack();
@@ -657,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
657 if (test_bit(META_discard, &mp->flag)) { 658 if (test_bit(META_discard, &mp->flag)) {
658 if (!new) { 659 if (!new) {
659 jfs_error(inode->i_sb, 660 jfs_error(inode->i_sb,
660 "__get_metapage: using a " 661 "using a discarded metapage\n");
661 "discarded metapage");
662 discard_metapage(mp); 662 discard_metapage(mp);
663 goto unlock; 663 goto unlock;
664 } 664 }
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
108 108
109extern int readSuper(struct super_block *, struct buffer_head **); 109extern int readSuper(struct super_block *, struct buffer_head **);
110extern int updateSuper(struct super_block *, uint); 110extern int updateSuper(struct super_block *, uint);
111__printf(2, 3)
111extern void jfs_error(struct super_block *, const char *, ...); 112extern void jfs_error(struct super_block *, const char *, ...);
112extern int jfs_mount(struct super_block *); 113extern int jfs_mount(struct super_block *);
113extern int jfs_mount_rw(struct super_block *, int); 114extern int jfs_mount_rw(struct super_block *, int);
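
This one-line annotation is what makes the jfs_error() message cleanups above checkable: __printf(2, 3) is the kernel's shorthand for GCC/Clang's format(printf, ...) attribute, so the compiler now verifies each caller's format string against its arguments (catching mismatches like the bn=%Ld → bn=%lld fix in jfs_dtree.c). A standalone sketch of such a helper (jfs_error_demo and its message prefix are illustrative, not the kernel implementation):

#include <stdarg.h>
#include <stdio.h>

/* Same definition the kernel uses for __printf() */
#define __printf(a, b) __attribute__((format(printf, a, b)))

struct super_block { const char *s_id; };

/* printf-style error reporter: the attribute makes the compiler
 * type-check every caller's format string against its arguments. */
__printf(2, 3)
static void jfs_error_demo(struct super_block *sb, const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    fprintf(stderr, "ERROR: (device %s): ", sb->s_id);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main(void)
{
    struct super_block sb = { "sda1" };

    jfs_error_demo(&sb, "inum = %d, iagno = %d\n", 42, 7);
    /* jfs_error_demo(&sb, "inum = %d\n", "oops");  <- now warns */
    return 0;
}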
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
2684 * mark filesystem dirty 2684 * mark filesystem dirty
2685 */ 2685 */
2686 if (dirty) 2686 if (dirty)
2687 jfs_error(tblk->sb, "txAbort"); 2687 jfs_error(tblk->sb, "\n");
2688 2688
2689 return; 2689 return;
2690} 2690}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
64 64
65/* get page buffer for specified block address */ 65/* get page buffer for specified block address */
66/* ToDo: Replace this ugly macro with a function */ 66/* ToDo: Replace this ugly macro with a function */
67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
68{\ 68do { \
69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ 69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
70 if (!(RC))\ 70 if (!(RC)) { \
71 {\ 71 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
72 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ 72 (le16_to_cpu((P)->header.nextindex) > \
73 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ 73 le16_to_cpu((P)->header.maxentry)) || \
74 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ 74 (le16_to_cpu((P)->header.maxentry) > \
75 {\ 75 (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
76 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ 76 jfs_error((IP)->i_sb, \
77 BT_PUTPAGE(MP);\ 77 "XT_GETPAGE: xtree page corrupt\n"); \
78 MP = NULL;\ 78 BT_PUTPAGE(MP); \
79 RC = -EIO;\ 79 MP = NULL; \
80 }\ 80 RC = -EIO; \
81 }\ 81 } \
82} 82 } \
83} while (0)
83 84
84/* for consistency */ 85/* for consistency */
85#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 86#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
 
 		/* push (bn, index) of the parent page/entry */
 		if (BT_STACK_FULL(btstack)) {
-			jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+			jfs_error(ip->i_sb, "stack overrun!\n");
 			XT_PUTPAGE(mp);
 			return -EIO;
 		}
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+		jfs_error(ip->i_sb, "xtSearch did not find extent\n");
 		return -EIO;
 	}
 
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
 	xad = &p->xad[index];
 	if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+		jfs_error(ip->i_sb, "extension is not contiguous\n");
 		return -EIO;
 	}
 
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+		jfs_error(ip->i_sb, "couldn't find extent\n");
 		return -EIO;
 	}
 
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
 	nextindex = le16_to_cpu(p->header.nextindex);
 	if (index != nextindex - 1) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb,
-			  "xtTailgate: the entry found is not the last entry");
+		jfs_error(ip->i_sb, "the entry found is not the last entry\n");
 		return -EIO;
 	}
 
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+		jfs_error(ip->i_sb, "Could not find extent\n");
 		return -EIO;
 	}
 
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 	    (nxoff + nxlen > xoff + xlen)) {
 		XT_PUTPAGE(mp);
 		jfs_error(ip->i_sb,
-			  "xtUpdate: nXAD in not completely contained within XAD");
+			  "nXAD in not completely contained within XAD\n");
 		return -EIO;
 	}
 
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 	if (xoff >= nxoff) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+		jfs_error(ip->i_sb, "xoff >= nxoff\n");
 		return -EIO;
 	}
 /* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 		if (cmp != 0) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+			jfs_error(ip->i_sb, "xtSearch failed\n");
 			return -EIO;
 		}
 
 		if (index0 != index) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb,
-				  "xtUpdate: unexpected value of index");
+			jfs_error(ip->i_sb, "unexpected value of index\n");
 			return -EIO;
 		}
 	}
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
       getChild:
 	/* save current parent entry for the child page */
 	if (BT_STACK_FULL(&btstack)) {
-		jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+		jfs_error(ip->i_sb, "stack overrun!\n");
 		XT_PUTPAGE(mp);
 		return -EIO;
 	}
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 
 		if (cmp != 0) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb,
-				  "xtTruncate_pmap: did not find extent");
+			jfs_error(ip->i_sb, "did not find extent\n");
 			return -EIO;
 		}
 	} else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
      getChild:
 	/* save current parent entry for the child page */
 	if (BT_STACK_FULL(&btstack)) {
-		jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+		jfs_error(ip->i_sb, "stack overrun!\n");
 		XT_PUTPAGE(mp);
 		return -EIO;
 	}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad6086..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (!S_ISDIR(old_ip->i_mode) && new_ip)
 			IWRITE_UNLOCK(new_ip);
 		jfs_error(new_ip->i_sb,
-			  "jfs_rename: new_ip->i_nlink != 0");
+			  "new_ip->i_nlink != 0\n");
 		return -EIO;
 	}
 	tblk = tid_to_tblock(tid);
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 
 const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= jfs_readdir,
+	.iterate	= jfs_readdir,
 	.fsync		= jfs_fsync,
 	.unlocked_ioctl = jfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
-		struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
 {
 	unsigned long hash;
 	int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
 	return 0;
 }
 
-static int jfs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 		goto resume;
 
       error_out:
-	jfs_error(sb, "jfs_extendfs");
+	jfs_error(sb, "\n");
 
       resume:
 	/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
 	/* nothing is done for continue beyond marking the superblock dirty */
 }
 
-void jfs_error(struct super_block *sb, const char * function, ...)
+void jfs_error(struct super_block *sb, const char *fmt, ...)
 {
-	static char error_buf[256];
+	struct va_format vaf;
 	va_list args;
 
-	va_start(args, function);
-	vsnprintf(error_buf, sizeof(error_buf), function, args);
-	va_end(args);
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
+	pr_err("ERROR: (device %s): %pf: %pV\n",
+	       sb->s_id, __builtin_return_address(0), &vaf);
+
+	va_end(args);
 
 	jfs_handle_error(sb);
 }
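
The rework above centralizes message formatting: jfs_error() now wraps its varargs in a struct va_format and defers expansion to printk's %pV, while %pf with __builtin_return_address(0) names the calling function, which is why every call site in this series can drop its hand-written "function:" prefix (and a bare "\n" becomes a valid format). A rough user-space analogue of the forwarding, with vfprintf standing in for %pV:

    #include <stdarg.h>
    #include <stdio.h>

    /* Sketch: one central reporter receives the caller's format string
     * and argument list, adds the common prefix, and prints once. */
    static void report_error(const char *dev, const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "ERROR: (device %s): ", dev);
        vfprintf(stderr, fmt, args);   /* user-space stand-in for %pV */
        va_end(args);
    }

    int main(void)
    {
        report_error("sda1", "xtSearch did not find extent\n");
        return 0;
    }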
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
 	txQuiesce(sb);
 	rc = lmLogShutdown(log);
 	if (rc) {
-		jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+		jfs_error(sb, "lmLogShutdown failed\n");
 
 		/* let operations fail rather than hang */
 		txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		rc = updateSuper(sb, FM_MOUNT);
 		if (rc) {
-			jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+			jfs_error(sb, "updateSuper failed\n");
 			goto out;
 		}
 		rc = lmLogInit(log);
 		if (rc)
-			jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+			jfs_error(sb, "lmLogInit failed\n");
 out:
 		txResume(sb);
 	}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
 
 	nbytes = sizeDXD(&ji->ea);
 	if (!nbytes) {
-		jfs_error(sb, "ea_read: nbytes is 0");
+		jfs_error(sb, "nbytes is 0\n");
 		return -EIO;
 	}
 
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
 		current_blocks = 0;
 	} else {
 		if (!(ji->ea.flag & DXD_EXTENT)) {
-			jfs_error(sb, "ea_get: invalid ea.flag)");
+			jfs_error(sb, "invalid ea.flag\n");
 			return -EIO;
 		}
 		current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 #ifdef CONFIG_JFS_SECURITY
-int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		   void *fs_info)
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+			  void *fs_info)
 {
 	const struct xattr *xattr;
 	tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158b..3a3a9b53bf5a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -61,7 +61,8 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
-	d_set_d_op(dentry, &simple_dentry_operations);
+	if (!dentry->d_sb->s_d_op)
+		d_set_d_op(dentry, &simple_dentry_operations);
 	d_add(dentry, NULL);
 	return NULL;
 }
@@ -135,60 +136,40 @@ static inline unsigned char dt_type(struct inode *inode)
  * both impossible due to the lock on directory.
  */
 
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct dentry *cursor = filp->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
-	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			spin_lock(&dentry->d_lock);
-			if (filp->f_pos == 2)
-				list_move(q, &dentry->d_subdirs);
-
-			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				if (!simple_positive(next)) {
-					spin_unlock(&next->d_lock);
-					continue;
-				}
-
-				spin_unlock(&next->d_lock);
-				spin_unlock(&dentry->d_lock);
-				if (filldir(dirent, next->d_name.name,
-					    next->d_name.len, filp->f_pos,
-					    next->d_inode->i_ino,
-					    dt_type(next->d_inode)) < 0)
-					return 0;
-				spin_lock(&dentry->d_lock);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				/* next is still alive */
-				list_move(q, p);
-				spin_unlock(&next->d_lock);
-				p = q;
-				filp->f_pos++;
-			}
-			spin_unlock(&dentry->d_lock);
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
+
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
+
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      next->d_inode->i_ino, dt_type(next->d_inode)))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
+	}
+	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
@@ -202,7 +183,7 @@ const struct file_operations simple_dir_operations = {
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 
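
dcache_readdir() above is one instance of the tree-wide conversion in this merge from the filldir callback to ->iterate with a struct dir_context: dir_emit_dots() takes care of "." and "..", dir_emit() returns false once the caller's buffer is full, and ctx->pos replaces hand-rolled file->f_pos bookkeeping. The skeleton the converted functions share looks roughly like this; first_entry() and next_entry() are hypothetical stand-ins for per-filesystem lookup code, so this is a sketch of the API shape, not a real filesystem:

    /* Sketch of an ->iterate handler in the style used by this series. */
    static int example_readdir(struct file *file, struct dir_context *ctx)
    {
        struct example_dirent *de;

        if (!dir_emit_dots(file, ctx))   /* emits "." and ".." as needed */
            return 0;

        for (de = first_entry(file, ctx->pos); de; de = next_entry(de)) {
            if (!dir_emit(ctx, de->name, de->namelen, de->ino, de->type))
                return 0;   /* buffer full; getdents() resumes at ctx->pos */
            ctx->pos++;     /* advance only after a successful emit */
        }
        return 0;
    }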
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 01bfe7662751..41e491b8e5d7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 				     nlm_init->protocol, nlm_version,
 				     nlm_init->hostname, nlm_init->noresvport,
 				     nlm_init->net);
-	if (host == NULL) {
-		lockd_down(nlm_init->net);
-		return ERR_PTR(-ENOLCK);
-	}
+	if (host == NULL)
+		goto out_nohost;
+	if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
+		goto out_nobind;
 
 	return host;
+out_nobind:
+	nlmclnt_release_host(host);
+out_nohost:
+	lockd_down(nlm_init->net);
+	return ERR_PTR(-ENOLCK);
 }
 EXPORT_SYMBOL_GPL(nlmclnt_init);
 
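
The nlmclnt_init() change above adopts the kernel's usual goto-unwind shape: each failure jumps to a label that releases exactly what was acquired before the failure point, so there is one teardown path instead of duplicated cleanup. The same pattern in a freestanding sketch (the resources here are hypothetical):

    #include <stdlib.h>

    /* Sketch: later failures unwind earlier acquisitions in reverse
     * order, mirroring the out_nobind/out_nohost labels above. */
    static int setup(void)
    {
        char *a, *b;

        a = malloc(16);
        if (!a)
            goto out;
        b = malloc(32);
        if (!b)
            goto out_free_a;

        /* ... use a and b ... */
        free(b);
        free(a);
        return 0;

    out_free_a:
        free(a);
    out:
        return -1;
    }

    int main(void) { return setup() ? 1 : 0; }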
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 9760ecb9b60f..acd394716349 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 {
 	struct nlm_args	*argp = &req->a_args;
 	struct nlm_lock	*lock = &argp->lock;
+	char *nodename = req->a_host->h_rpcclnt->cl_nodename;
 
 	nlmclnt_next_cookie(&argp->cookie);
 	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
-	lock->caller  = utsname()->nodename;
+	lock->caller  = nodename;
 	lock->oh.data = req->a_owner;
 	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
 				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
-				utsname()->nodename);
+				nodename);
 	lock->svid = fl->fl_u.nfs_fl.owner->pid;
 	lock->fl.fl_start = fl->fl_start;
 	lock->fl.fl_end = fl->fl_end;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
 const struct lock_manager_operations nlmsvc_lock_operations = {
 	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
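
lm_owner_key, added here, lets each lock manager collapse its notion of lock ownership into one unsigned long so the new blocked_hash in fs/locks.c can pick a bucket; NLM xors the owner pointer with the pid because the pair, not either field alone, identifies an owner. A toy demonstration of why both fields must feed the key (types simplified; 128 matches the 2^7 buckets defined later in this series):

    #include <stdio.h>

    static unsigned long owner_key(const void *owner, unsigned long pid)
    {
        return (unsigned long)owner ^ pid;  /* as nlmsvc_owner_key() does */
    }

    int main(void)
    {
        int owner;   /* stand-in for fl->fl_owner */

        /* same owner pointer, different pids: distinct keys/buckets */
        printf("pid 100 -> bucket %lu\n", owner_key(&owner, 100) % 128);
        printf("pid 101 -> bucket %lu\n", owner_key(&owner, 101) % 128);
        return 0;
    }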
@@ -939,6 +951,7 @@ nlmsvc_retry_blocked(void)
 	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
 	struct nlm_block *block;
 
+	spin_lock(&nlm_blocked_lock);
 	while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
 		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
 
@@ -948,6 +961,7 @@ nlmsvc_retry_blocked(void)
 			timeout = block->b_when - jiffies;
 			break;
 		}
+		spin_unlock(&nlm_blocked_lock);
 
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
 			block, block->b_when);
@@ -957,7 +971,9 @@ nlmsvc_retry_blocked(void)
 			retry_deferred_block(block);
 		} else
 			nlmsvc_grant_blocked(block);
+		spin_lock(&nlm_blocked_lock);
 	}
+	spin_unlock(&nlm_blocked_lock);
 
 	return timeout;
 }
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
-	lock_flocks(); /* protects i_flock list */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,7 +181,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
 			goto again;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return 0;
 }
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops == &nlmsvc_lock_operations) {
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,9 @@
 #include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
 
 #include <asm/uaccess.h>
 
@@ -153,30 +156,53 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-static LIST_HEAD(file_lock_list);
-static LIST_HEAD(blocked_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+/*
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
+ */
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
 
 /*
- * Protects the two list heads above, plus the inode->i_flock list
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
  */
-void lock_flocks(void)
-{
-	spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
+#define BLOCKED_HASH_BITS	7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
 
-void unlock_flocks(void)
-{
-	spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the file_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
 {
-	INIT_LIST_HEAD(&fl->fl_link);
+	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_block);
 	init_waitqueue_head(&fl->fl_wait);
 }
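
The blocked_hash declared above is a fixed-size kernel hashtable with 2^7 bucket heads; hash_add() buckets an entry by the supplied key, and hash_for_each_possible() walks only the bucket a given key maps to. Condensed from the helpers this patch adds further down (same names and fields as in the patch):

    /* Sketch of the <linux/hashtable.h> usage pattern behind blocked_hash. */
    static void add_waiter(struct file_lock *waiter)
    {
        /* bucket chosen from the owner key, computed once at insert */
        hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
    }

    static struct file_lock *find_by_owner(struct file_lock *block_fl)
    {
        struct file_lock *fl;

        /* walk only the bucket this owner key maps to */
        hash_for_each_possible(blocked_hash, fl, fl_link,
                               posix_owner_key(block_fl))
            if (posix_same_owner(fl, block_fl))
                return fl;
        return NULL;
    }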
@@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!list_empty(&fl->fl_link));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
@@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
+/* Must be called with the i_lock held! */
+static inline void
+locks_insert_global_locks(struct file_lock *fl)
+{
+	lg_local_lock(&file_lock_lglock);
+	fl->fl_link_cpu = smp_processor_id();
+	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+	lg_local_unlock(&file_lock_lglock);
+}
+
+/* Must be called with the i_lock held! */
+static inline void
+locks_delete_global_locks(struct file_lock *fl)
+{
+	/*
+	 * Avoid taking lock if already unhashed. This is safe since this check
+	 * is done while holding the i_lock, and new insertions into the list
+	 * also require that it be held.
+	 */
+	if (hlist_unhashed(&fl->fl_link))
+		return;
+	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	hlist_del_init(&fl->fl_link);
+	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
+static inline void
+locks_insert_global_blocked(struct file_lock *waiter)
+{
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static inline void
+locks_delete_global_blocked(struct file_lock *waiter)
+{
+	hash_del(&waiter->fl_link);
+}
+
 /* Remove waiter from blocker's block list.
  * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
+	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
-	list_del_init(&waiter->fl_link);
 	waiter->fl_next = NULL;
 }
 
-/*
- */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	__locks_delete_block(waiter);
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 }
-EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
  * it seems like the reasonable thing to do.
+ *
+ * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * list itself is protected by the file_lock_list, but by ensuring that the
+ * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * in some cases when we see that the fl_block list is empty.
  */
-static void locks_insert_block(struct file_lock *blocker,
-			       struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
 {
 	BUG_ON(!list_empty(&waiter->fl_block));
-	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	waiter->fl_next = blocker;
+	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	if (IS_POSIX(blocker))
-		list_add(&waiter->fl_link, &blocked_list);
+		locks_insert_global_blocked(waiter);
 }
 
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->i_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
+	/*
+	 * Avoid taking global lock if list is empty. This is safe since new
+	 * blocked requests are only added to the list under the i_lock, and
+	 * the i_lock is always held here. Note that removal from the fl_block
+	 * list does not require the i_lock, so we must recheck list_empty()
+	 * after acquiring the blocked_lock_lock.
+	 */
+	if (list_empty(&blocker->fl_block))
+		return;
+
+	spin_lock(&blocked_lock_lock);
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter;
 
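
The comments in this hunk pin down the lock hierarchy the rest of the file relies on: i_lock first, then blocked_lock_lock, with the list_empty() fast paths safe only because insertions always hold both. Restated as code, the only legal nesting when both locks are needed is the following (names taken from this patch; the reverse order would risk deadlock):

    static void block_on(struct inode *inode, struct file_lock *blocker,
                         struct file_lock *waiter)
    {
        spin_lock(&inode->i_lock);          /* 1: per-inode lock */
        spin_lock(&blocked_lock_lock);      /* 2: global blocked-hash lock */
        __locks_insert_block(blocker, waiter);
        spin_unlock(&blocked_lock_lock);
        spin_unlock(&inode->i_lock);
    }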
@@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		else
 			wake_up(&waiter->fl_wait);
 	}
+	spin_unlock(&blocked_lock_lock);
 }
 
 /* Insert file lock fl into an inode's lock list at the position indicated
  * by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
-	list_add(&fl->fl_link, &file_lock_list);
-
 	fl->fl_nspid = get_pid(task_tgid(current));
 
 	/* insert into file's list */
 	fl->fl_next = *pos;
 	*pos = fl;
+
+	locks_insert_global_locks(fl);
 }
 
 /*
@@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
  * Wake up processes that are blocked waiting for this lock,
  * notify the FS that the lock has been cleared and
  * finally free the lock.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_delete_lock(struct file_lock **thisfl_p)
 {
 	struct file_lock *fl = *thisfl_p;
 
+	locks_delete_global_locks(fl);
+
 	*thisfl_p = fl->fl_next;
 	fl->fl_next = NULL;
-	list_del_init(&fl->fl_link);
 
 	if (fl->fl_nspid) {
 		put_pid(fl->fl_nspid);
@@ -625,8 +728,9 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
+	struct inode *inode = file_inode(filp);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 		fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	list_for_each_entry(fl, &blocked_list, fl_link) {
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
 	return NULL;
 }
 
+/* Must be called with the blocked_lock_lock held! */
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
@@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		return -ENOMEM;
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
 
@@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		cond_resched();
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
 
 find_conflict:
@@ -777,7 +882,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
-	int error, added = 0;
+	int error;
+	bool added = false;
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_hash.
+	 */
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
 			error = -EDEADLK;
-			if (posix_locks_deadlock(request, fl))
-				goto out;
-			error = FILE_LOCK_DEFERRED;
-			locks_insert_block(fl, request);
+			spin_lock(&blocked_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&blocked_lock_lock);
 			goto out;
 		}
 	}
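
The hunk above is the heart of the change: deadlock detection and enqueueing on the blocker now happen inside one blocked_lock_lock critical section, since a wait-for graph checked under one lock and modified under another could race. Conceptually the detector is just pointer chasing over that graph; the sketch below is simplified and omits the iteration bound the real posix_locks_deadlock() applies:

    /* Sketch: follow "which lock is that owner waiting on?" links from
     * the lock we would block on; meeting our own owner closes a cycle. */
    static int would_deadlock(struct file_lock *caller_fl,
                              struct file_lock *block_fl)
    {
        while (block_fl) {
            if (posix_same_owner(caller_fl, block_fl))
                return 1;    /* granting this wait would deadlock */
            block_fl = what_owner_is_waiting_for(block_fl);
        }
        return 0;
    }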
@@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		before = &fl->fl_next;
 	}
 
-	/* Process locks with this owner.  */
+	/* Process locks with this owner. */
 	while ((fl = *before) && posix_same_owner(request, fl)) {
 		/* Detect adjacent or overlapping regions (if same lock type)
 		 */
@@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				continue;
 			}
 			request = fl;
-			added = 1;
+			added = true;
 		}
 		else {
 			/* Processing for different lock types is a bit
@@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			if (fl->fl_start > request->fl_end)
 				break;
 			if (request->fl_type == F_UNLCK)
-				added = 1;
+				added = true;
 			if (fl->fl_start < request->fl_start)
 				left = fl;
 			/* If the next lock in the list has a higher end
@@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				locks_release_private(fl);
 				locks_copy_private(fl, request);
 				request = fl;
-				added = 1;
+				added = true;
 			}
 		}
 		/* Go on to next lock.
@@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	/*
-	 * The above code only modifies existing locks in case of
-	 * merging or replacing. If new lock(s) need to be inserted
-	 * all modifications are done bellow this, so it's safe yet to
-	 * bail out.
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
 	if (right && left == right && !new_fl2)
@@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	time_out_leases(inode);
 
@@ -1249,11 +1365,11 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
-	lock_flocks();
-	__locks_delete_block(new_fl);
+	spin_lock(&inode->i_lock);
+	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
@@ -1270,7 +1386,7 @@ restart:
 	}
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	locks_free_lock(new_fl);
 	return error;
 }
@@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	time_out_leases(file_inode(filp));
 	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return type;
 }
 
-int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		goto out;
 	if ((arg == F_WRLCK)
-	    && ((dentry->d_count > 1)
+	    && ((d_count(dentry) > 1)
 		|| (atomic_read(&inode->i_count) > 1)))
 		goto out;
 
@@ -1403,7 +1520,7 @@ out:
 	return error;
 }
 
-int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp, struct file_lock **flp)
 {
 	struct file_lock *fl, **before;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
  * The (input) flp->fl_lmops->lm_break function is required
  * by break_lease().
  *
- * Called with file_lock_lock held.
+ * Called with inode->i_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
+	struct inode *inode = file_inode(filp);
 	int error;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, lease);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return error;
 }
@@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
+	struct inode *inode = file_inode(filp);
 	struct fasync_struct *new;
 	int error;
 
@@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		return -ENOMEM;
 	}
 	ret = fl;
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
@@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	new = NULL;
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 out_free_fasync:
 	if (new)
@@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp)
 		fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp)
 		}
 		before = &fl->fl_next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 }
 
 /**
  * posix_unblock_lock - stop waiting for a file lock
- *	@filp:   how the file was opened
  *	@waiter: the lock which was waiting
  *
  *	lockd needs to block waiting for locks.
  */
 int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 	return status;
 }
-
 EXPORT_SYMBOL(posix_unblock_lock);
 
 /**
@@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct locks_iterator {
+	int	li_cpu;
+	loff_t	li_pos;
+};
+
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 			    loff_t id, char *pfx)
 {
@@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 
 static int locks_show(struct seq_file *f, void *v)
 {
+	struct locks_iterator *iter = f->private;
 	struct file_lock *fl, *bfl;
 
-	fl = list_entry(v, struct file_lock, fl_link);
+	fl = hlist_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, *((loff_t *)f->private), "");
+	lock_get_status(f, fl, iter->li_pos, "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+		lock_get_status(f, bfl, iter->li_pos, " ->");
 
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	loff_t *p = f->private;
+	struct locks_iterator *iter = f->private;
 
-	lock_flocks();
-	*p = (*pos + 1);
-	return seq_list_start(&file_lock_list, *pos);
+	iter->li_pos = *pos + 1;
+	lg_global_lock(&file_lock_lglock);
+	spin_lock(&blocked_lock_lock);
+	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
-	loff_t *p = f->private;
-	++*p;
-	return seq_list_next(v, &file_lock_list, pos);
+	struct locks_iterator *iter = f->private;
+
+	++iter->li_pos;
+	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
+	lg_global_unlock(&file_lock_lglock);
 }
 
 static const struct seq_operations locks_seq_operations = {
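
With file_lock_list split per CPU, the /proc/locks cursor needs both a position and a CPU (struct locks_iterator) so seq_hlist_next_percpu() can hop from one CPU's list to the next, and locks_start() must take every per-CPU spinlock via lg_global_lock() to get a stable snapshot. The division of labor the lglock buys, in outline (names from this patch):

    /* Writers touch only their own CPU's list under the cheap local lock;
     * the rare global reader (this seq_file) locks all CPUs at once. */
    static void writer_insert(struct file_lock *fl)
    {
        lg_local_lock(&file_lock_lglock);
        fl->fl_link_cpu = smp_processor_id();
        hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
        lg_local_unlock(&file_lock_lglock);
    }

    static void reader_walk_all(void)
    {
        lg_global_lock(&file_lock_lglock);   /* every CPU's spinlock */
        /* ... seq_hlist_start_percpu()/seq_hlist_next_percpu() walk ... */
        lg_global_unlock(&file_lock_lglock);
    }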
@@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+	return seq_open_private(filp, &locks_seq_operations,
+					sizeof(struct locks_iterator));
 }
 
 static const struct file_operations proc_locks_operations = {
@@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 		}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 		}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
 
 static int __init filelock_init(void)
 {
+	int i;
+
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
+	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+	for_each_possible_cpu(i)
+		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
 	return 0;
 }
 
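
filelock_init() initializes the list head of every possible CPU, not just the online ones, because locks may later be inserted from a CPU that was offline at boot. The same boot-time pattern in isolation (the per-CPU variable name here is hypothetical):

    /* Sketch of the per-CPU container init used above. */
    static DEFINE_PER_CPU(struct hlist_head, example_list);

    static int __init example_init(void)
    {
        int i;

        for_each_possible_cpu(i)
            INIT_HLIST_HEAD(per_cpu_ptr(&example_list, i));
        return 0;
    }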
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b82751082112..6bdc347008f5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
281 281
282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good 282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
283 * way to combine the two copies */ 283 * way to combine the two copies */
284#define IMPLICIT_NODES 2 284static int logfs_readdir(struct file *file, struct dir_context *ctx)
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{ 285{
287 struct inode *dir = file_inode(file); 286 struct inode *dir = file_inode(file);
288 loff_t pos = file->f_pos - IMPLICIT_NODES; 287 loff_t pos;
289 struct page *page; 288 struct page *page;
290 struct logfs_disk_dentry *dd; 289 struct logfs_disk_dentry *dd;
291 int full;
292 290
291 if (ctx->pos < 0)
292 return -EINVAL;
293
294 if (!dir_emit_dots(file, ctx))
295 return 0;
296
297 pos = ctx->pos - 2;
293 BUG_ON(pos < 0); 298 BUG_ON(pos < 0);
294 for (;; pos++) { 299 for (;; pos++, ctx->pos++) {
300 bool full;
295 if (beyond_eof(dir, pos)) 301 if (beyond_eof(dir, pos))
296 break; 302 break;
297 if (!logfs_exist_block(dir, pos)) { 303 if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
306 dd = kmap(page); 312 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 313 BUG_ON(dd->namelen == 0);
308 314
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 315 full = !dir_emit(ctx, (char *)dd->name,
310 pos, be64_to_cpu(dd->ino), dd->type); 316 be16_to_cpu(dd->namelen),
317 be64_to_cpu(dd->ino), dd->type);
311 kunmap(page); 318 kunmap(page);
312 page_cache_release(page); 319 page_cache_release(page);
313 if (full) 320 if (full)
314 break; 321 break;
315 } 322 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0; 323 return 0;
319} 324}
320 325
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file_inode(file);
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) 326static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{ 327{
347 dd->namelen = cpu_to_be16(name->len); 328 dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
814const struct file_operations logfs_dir_fops = { 795const struct file_operations logfs_dir_fops = {
815 .fsync = logfs_fsync, 796 .fsync = logfs_fsync,
816 .unlocked_ioctl = logfs_ioctl, 797 .unlocked_ioctl = logfs_ioctl,
817 .readdir = logfs_readdir, 798 .iterate = logfs_readdir,
818 .read = generic_read_dir, 799 .read = generic_read_dir,
819 .llseek = default_llseek, 800 .llseek = default_llseek,
820}; 801};
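
This logfs conversion is the template for the whole series: ->readdir(file, buf, filldir) becomes ->iterate(file, ctx), the position lives in ctx->pos rather than file->f_pos, dir_emit_dots() replaces the hand-rolled "." and ".." emission, and dir_emit() returns false once the user buffer is full instead of a filldir error code. A minimal iterate method under those 3.11-era assumptions (examplefs and its single fixed entry are hypothetical):

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	/* emits "." and "..", advancing ctx->pos past position 1 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* one fabricated entry at position 2; a false return from
	 * dir_emit() means the buffer is full, which is not an error */
	if (ctx->pos == 2) {
		if (!dir_emit(ctx, "example", 7, 100 /* ino */, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}
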
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
159 return __logfs_writepage(page); 159 return __logfs_writepage(page);
160} 160}
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned int offset,
163 unsigned int length)
163{ 164{
164 struct logfs_block *block = logfs_block(page); 165 struct logfs_block *block = logfs_block(page);
165 166
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
884 return area; 884 return area;
885} 885}
886 886
887static void map_invalidatepage(struct page *page, unsigned long l) 887static void map_invalidatepage(struct page *page, unsigned int o,
888 unsigned int l)
888{ 889{
889 return; 890 return;
890} 891}
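
Both logfs hunks above track an independent VFS change: ->invalidatepage now takes an (offset, length) pair so a sub-range of a page can be invalidated, where the old single-offset form always meant "from offset to the end of the page". A sketch of the new signature (examplefs is hypothetical; PAGE_CACHE_SIZE is the 3.11-era page-cache unit):

static void examplefs_invalidatepage(struct page *page, unsigned int offset,
				     unsigned int length)
{
	/* only a whole-page invalidation drops private state in this
	 * sketch; partial ranges leave the page's metadata in place */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		try_to_release_page(page, 0);
}
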
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f36e6ea..dfaf6fa9b7b5 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
16typedef struct minix_dir_entry minix_dirent; 16typedef struct minix_dir_entry minix_dirent;
17typedef struct minix3_dir_entry minix3_dirent; 17typedef struct minix3_dir_entry minix3_dirent;
18 18
19static int minix_readdir(struct file *, void *, filldir_t); 19static int minix_readdir(struct file *, struct dir_context *);
20 20
21const struct file_operations minix_dir_operations = { 21const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .iterate = minix_readdir,
25 .fsync = generic_file_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
82 return (void*)((char*)de + sbi->s_dirsize); 82 return (void*)((char*)de + sbi->s_dirsize);
83} 83}
84 84
85static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) 85static int minix_readdir(struct file *file, struct dir_context *ctx)
86{ 86{
87 unsigned long pos = filp->f_pos; 87 struct inode *inode = file_inode(file);
88 struct inode *inode = file_inode(filp);
89 struct super_block *sb = inode->i_sb; 88 struct super_block *sb = inode->i_sb;
90 unsigned offset = pos & ~PAGE_CACHE_MASK;
91 unsigned long n = pos >> PAGE_CACHE_SHIFT;
92 unsigned long npages = dir_pages(inode);
93 struct minix_sb_info *sbi = minix_sb(sb); 89 struct minix_sb_info *sbi = minix_sb(sb);
94 unsigned chunk_size = sbi->s_dirsize; 90 unsigned chunk_size = sbi->s_dirsize;
95 char *name; 91 unsigned long npages = dir_pages(inode);
96 __u32 inumber; 92 unsigned long pos = ctx->pos;
93 unsigned offset;
94 unsigned long n;
97 95
98 pos = (pos + chunk_size-1) & ~(chunk_size-1); 96 ctx->pos = pos = ALIGN(pos, chunk_size);
99 if (pos >= inode->i_size) 97 if (pos >= inode->i_size)
100 goto done; 98 return 0;
99
100 offset = pos & ~PAGE_CACHE_MASK;
101 n = pos >> PAGE_CACHE_SHIFT;
101 102
102 for ( ; n < npages; n++, offset = 0) { 103 for ( ; n < npages; n++, offset = 0) {
103 char *p, *kaddr, *limit; 104 char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
109 p = kaddr+offset; 110 p = kaddr+offset;
110 limit = kaddr + minix_last_byte(inode, n) - chunk_size; 111 limit = kaddr + minix_last_byte(inode, n) - chunk_size;
111 for ( ; p <= limit; p = minix_next_entry(p, sbi)) { 112 for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
113 const char *name;
114 __u32 inumber;
112 if (sbi->s_version == MINIX_V3) { 115 if (sbi->s_version == MINIX_V3) {
113 minix3_dirent *de3 = (minix3_dirent *)p; 116 minix3_dirent *de3 = (minix3_dirent *)p;
114 name = de3->name; 117 name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
119 inumber = de->inode; 122 inumber = de->inode;
120 } 123 }
121 if (inumber) { 124 if (inumber) {
122 int over;
123
124 unsigned l = strnlen(name, sbi->s_namelen); 125 unsigned l = strnlen(name, sbi->s_namelen);
125 offset = p - kaddr; 126 if (!dir_emit(ctx, name, l,
126 over = filldir(dirent, name, l, 127 inumber, DT_UNKNOWN)) {
127 (n << PAGE_CACHE_SHIFT) | offset,
128 inumber, DT_UNKNOWN);
129 if (over) {
130 dir_put_page(page); 128 dir_put_page(page);
131 goto done; 129 return 0;
132 } 130 }
133 } 131 }
132 ctx->pos += chunk_size;
134 } 133 }
135 dir_put_page(page); 134 dir_put_page(page);
136 } 135 }
137
138done:
139 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
140 return 0; 136 return 0;
141} 137}
142 138
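
minix addresses a directory by byte offset into fixed-size on-disk entries, so the rewrite rounds ctx->pos up to an entry boundary with ALIGN() and advances it by chunk_size only after an entry is successfully emitted, which keeps resumption stable across getdents() calls. The bookkeeping, isolated as a hypothetical helper:

static bool emit_fixed_size_entry(struct dir_context *ctx, const char *name,
				  unsigned len, u32 inumber,
				  unsigned chunk_size)
{
	if (!dir_emit(ctx, name, len, inumber, DT_UNKNOWN))
		return false;	/* buffer full; ctx->pos still names this entry */
	ctx->pos += chunk_size;	/* resume at the next fixed-size slot */
	return true;
}
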
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
54 return error; 54 return error;
55} 55}
56 56
57static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
58{
59 int error;
60 struct inode *inode = minix_new_inode(dir, mode, &error);
61 if (inode) {
62 minix_set_inode(inode, 0);
63 mark_inode_dirty(inode);
64 d_tmpfile(dentry, inode);
65 }
66 return error;
67}
68
57static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, 69static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
58 bool excl) 70 bool excl)
59{ 71{
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
254 .mknod = minix_mknod, 266 .mknod = minix_mknod,
255 .rename = minix_rename, 267 .rename = minix_rename,
256 .getattr = minix_getattr, 268 .getattr = minix_getattr,
269 .tmpfile = minix_tmpfile,
257}; 270};
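
minix_tmpfile() is the filesystem half of the new O_TMPFILE support in this merge: allocate an inode with a zero link count and attach it to the dentry with d_tmpfile(). From userspace the feature looks like the sketch below; it assumes a 3.11-era kernel, a filesystem with a ->tmpfile method, and a libc that exposes O_TMPFILE (otherwise open() fails, typically with EOPNOTSUPP or EINVAL):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* an unnamed, already-unlinked regular file inside /tmp */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	write(fd, "scratch data\n", 13);
	close(fd);	/* contents vanish: the inode was never linked */
	return 0;
}
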
diff --git a/fs/namei.c b/fs/namei.c
index 9ed9361223c0..89a612e392eb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
1352 */ 1352 */
1353 if (nd->flags & LOOKUP_RCU) { 1353 if (nd->flags & LOOKUP_RCU) {
1354 unsigned seq; 1354 unsigned seq;
1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); 1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1356 if (!dentry) 1356 if (!dentry)
1357 goto unlazy; 1357 goto unlazy;
1358 1358
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1787 struct dentry *parent = nd->path.dentry; 1787 struct dentry *parent = nd->path.dentry;
1788 nd->flags &= ~LOOKUP_JUMPED; 1788 nd->flags &= ~LOOKUP_JUMPED;
1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1790 err = parent->d_op->d_hash(parent, nd->inode, 1790 err = parent->d_op->d_hash(parent, &this);
1791 &this);
1792 if (err < 0) 1791 if (err < 0)
1793 break; 1792 break;
1794 } 1793 }
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2121 * to use its own hash.. 2120 * to use its own hash..
2122 */ 2121 */
2123 if (base->d_flags & DCACHE_OP_HASH) { 2122 if (base->d_flags & DCACHE_OP_HASH) {
2124 int err = base->d_op->d_hash(base, base->d_inode, &this); 2123 int err = base->d_op->d_hash(base, &this);
2125 if (err < 0) 2124 if (err < 0)
2126 return ERR_PTR(err); 2125 return ERR_PTR(err);
2127 } 2126 }
@@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
2690 nd->flags &= ~LOOKUP_PARENT; 2689 nd->flags &= ~LOOKUP_PARENT;
2691 nd->flags |= op->intent; 2690 nd->flags |= op->intent;
2692 2691
2693 switch (nd->last_type) { 2692 if (nd->last_type != LAST_NORM) {
2694 case LAST_DOTDOT:
2695 case LAST_DOT:
2696 error = handle_dots(nd, nd->last_type); 2693 error = handle_dots(nd, nd->last_type);
2697 if (error) 2694 if (error)
2698 return error; 2695 return error;
2699 /* fallthrough */
2700 case LAST_ROOT:
2701 error = complete_walk(nd);
2702 if (error)
2703 return error;
2704 audit_inode(name, nd->path.dentry, 0);
2705 if (open_flag & O_CREAT) {
2706 error = -EISDIR;
2707 goto out;
2708 }
2709 goto finish_open;
2710 case LAST_BIND:
2711 error = complete_walk(nd);
2712 if (error)
2713 return error;
2714 audit_inode(name, dir, 0);
2715 goto finish_open; 2696 goto finish_open;
2716 } 2697 }
2717 2698
@@ -2841,19 +2822,19 @@ finish_lookup:
2841 } 2822 }
2842 nd->inode = inode; 2823 nd->inode = inode;
2843 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ 2824 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2825finish_open:
2844 error = complete_walk(nd); 2826 error = complete_walk(nd);
2845 if (error) { 2827 if (error) {
2846 path_put(&save_parent); 2828 path_put(&save_parent);
2847 return error; 2829 return error;
2848 } 2830 }
2831 audit_inode(name, nd->path.dentry, 0);
2849 error = -EISDIR; 2832 error = -EISDIR;
2850 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) 2833 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2851 goto out; 2834 goto out;
2852 error = -ENOTDIR; 2835 error = -ENOTDIR;
2853 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) 2836 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
2854 goto out; 2837 goto out;
2855 audit_inode(name, nd->path.dentry, 0);
2856finish_open:
2857 if (!S_ISREG(nd->inode->i_mode)) 2838 if (!S_ISREG(nd->inode->i_mode))
2858 will_truncate = false; 2839 will_truncate = false;
2859 2840
@@ -2920,6 +2901,67 @@ stale_open:
2920 goto retry_lookup; 2901 goto retry_lookup;
2921} 2902}
2922 2903
2904static int do_tmpfile(int dfd, struct filename *pathname,
2905 struct nameidata *nd, int flags,
2906 const struct open_flags *op,
2907 struct file *file, int *opened)
2908{
2909 static const struct qstr name = QSTR_INIT("/", 1);
2910 struct dentry *dentry, *child;
2911 struct inode *dir;
2912 int error = path_lookupat(dfd, pathname->name,
2913 flags | LOOKUP_DIRECTORY, nd);
2914 if (unlikely(error))
2915 return error;
2916 error = mnt_want_write(nd->path.mnt);
2917 if (unlikely(error))
2918 goto out;
2919 /* we want directory to be writable */
2920 error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
2921 if (error)
2922 goto out2;
2923 dentry = nd->path.dentry;
2924 dir = dentry->d_inode;
2925 if (!dir->i_op->tmpfile) {
2926 error = -EOPNOTSUPP;
2927 goto out2;
2928 }
2929 child = d_alloc(dentry, &name);
2930 if (unlikely(!child)) {
2931 error = -ENOMEM;
2932 goto out2;
2933 }
2934 nd->flags &= ~LOOKUP_DIRECTORY;
2935 nd->flags |= op->intent;
2936 dput(nd->path.dentry);
2937 nd->path.dentry = child;
2938 error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
2939 if (error)
2940 goto out2;
2941 audit_inode(pathname, nd->path.dentry, 0);
2942 error = may_open(&nd->path, op->acc_mode, op->open_flag);
2943 if (error)
2944 goto out2;
2945 file->f_path.mnt = nd->path.mnt;
2946 error = finish_open(file, nd->path.dentry, NULL, opened);
2947 if (error)
2948 goto out2;
2949 error = open_check_o_direct(file);
2950 if (error) {
2951 fput(file);
2952 } else if (!(op->open_flag & O_EXCL)) {
2953 struct inode *inode = file_inode(file);
2954 spin_lock(&inode->i_lock);
2955 inode->i_state |= I_LINKABLE;
2956 spin_unlock(&inode->i_lock);
2957 }
2958out2:
2959 mnt_drop_write(nd->path.mnt);
2960out:
2961 path_put(&nd->path);
2962 return error;
2963}
2964
2923static struct file *path_openat(int dfd, struct filename *pathname, 2965static struct file *path_openat(int dfd, struct filename *pathname,
2924 struct nameidata *nd, const struct open_flags *op, int flags) 2966 struct nameidata *nd, const struct open_flags *op, int flags)
2925{ 2967{
@@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
2935 2977
2936 file->f_flags = op->open_flag; 2978 file->f_flags = op->open_flag;
2937 2979
2980 if (unlikely(file->f_flags & __O_TMPFILE)) {
2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
2982 goto out;
2983 }
2984
2938 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); 2985 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2939 if (unlikely(error)) 2986 if (unlikely(error))
2940 goto out; 2987 goto out;
@@ -2987,9 +3034,10 @@ out:
2987} 3034}
2988 3035
2989struct file *do_filp_open(int dfd, struct filename *pathname, 3036struct file *do_filp_open(int dfd, struct filename *pathname,
2990 const struct open_flags *op, int flags) 3037 const struct open_flags *op)
2991{ 3038{
2992 struct nameidata nd; 3039 struct nameidata nd;
3040 int flags = op->lookup_flags;
2993 struct file *filp; 3041 struct file *filp;
2994 3042
2995 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 3043 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
3001} 3049}
3002 3050
3003struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 3051struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3004 const char *name, const struct open_flags *op, int flags) 3052 const char *name, const struct open_flags *op)
3005{ 3053{
3006 struct nameidata nd; 3054 struct nameidata nd;
3007 struct file *file; 3055 struct file *file;
3008 struct filename filename = { .name = name }; 3056 struct filename filename = { .name = name };
3057 int flags = op->lookup_flags | LOOKUP_ROOT;
3009 3058
3010 nd.root.mnt = mnt; 3059 nd.root.mnt = mnt;
3011 nd.root.dentry = dentry; 3060 nd.root.dentry = dentry;
3012 3061
3013 flags |= LOOKUP_ROOT;
3014
3015 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 3062 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3016 return ERR_PTR(-ELOOP); 3063 return ERR_PTR(-ELOOP);
3017 3064
@@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3586 3633
3587 mutex_lock(&inode->i_mutex); 3634 mutex_lock(&inode->i_mutex);
3588 /* Make sure we don't allow creating hardlink to an unlinked file */ 3635 /* Make sure we don't allow creating hardlink to an unlinked file */
3589 if (inode->i_nlink == 0) 3636 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3590 error = -ENOENT; 3637 error = -ENOENT;
3591 else if (max_links && inode->i_nlink >= max_links) 3638 else if (max_links && inode->i_nlink >= max_links)
3592 error = -EMLINK; 3639 error = -EMLINK;
3593 else 3640 else
3594 error = dir->i_op->link(old_dentry, dir, new_dentry); 3641 error = dir->i_op->link(old_dentry, dir, new_dentry);
3642
3643 if (!error && (inode->i_state & I_LINKABLE)) {
3644 spin_lock(&inode->i_lock);
3645 inode->i_state &= ~I_LINKABLE;
3646 spin_unlock(&inode->i_lock);
3647 }
3595 mutex_unlock(&inode->i_mutex); 3648 mutex_unlock(&inode->i_mutex);
3596 if (!error) 3649 if (!error)
3597 fsnotify_link(dir, inode, new_dentry); 3650 fsnotify_link(dir, inode, new_dentry);
@@ -3618,15 +3671,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3618 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) 3671 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3619 return -EINVAL; 3672 return -EINVAL;
3620 /* 3673 /*
3621 * To use null names we require CAP_DAC_READ_SEARCH 3674 * Using empty names is equivalent to using AT_SYMLINK_FOLLOW
3622 * This ensures that not everyone will be able to create 3675 * on /proc/self/fd/<fd>.
3623 * handlink using the passed filedescriptor.
3624 */ 3676 */
3625 if (flags & AT_EMPTY_PATH) { 3677 if (flags & AT_EMPTY_PATH)
3626 if (!capable(CAP_DAC_READ_SEARCH))
3627 return -ENOENT;
3628 how = LOOKUP_EMPTY; 3678 how = LOOKUP_EMPTY;
3629 }
3630 3679
3631 if (flags & AT_SYMLINK_FOLLOW) 3680 if (flags & AT_SYMLINK_FOLLOW)
3632 how |= LOOKUP_FOLLOW; 3681 how |= LOOKUP_FOLLOW;
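
Two of the vfs_link()/linkat() hunks above cooperate with O_TMPFILE: do_tmpfile() marks the new inode I_LINKABLE (unless O_EXCL was given), so vfs_link() now accepts it despite i_nlink == 0, and AT_EMPTY_PATH no longer requires CAP_DAC_READ_SEARCH because it is equivalent to following /proc/self/fd/<fd>. Publishing a temporary file under a real name, as a sketch (publish_tmpfile() and the paths are illustrative; /proc must be mounted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* create an anonymous file in dirpath, then link it in as name */
static int publish_tmpfile(const char *dirpath, const char *name)
{
	char proc[64];
	int fd = open(dirpath, O_TMPFILE | O_RDWR, 0600);
	if (fd < 0)
		return -1;
	/* the kernel marked this inode I_LINKABLE, so linkat() may
	 * give it a name even though i_nlink is still zero */
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, name, AT_SYMLINK_FOLLOW) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
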
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 6792ce11f2bf..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
23 23
24#include "ncp_fs.h" 24#include "ncp_fs.h"
25 25
26static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, struct dir_context *,
27 struct ncp_cache_control *); 27 struct ncp_cache_control *);
28static void ncp_do_readdir(struct file *, void *, filldir_t, 28static void ncp_do_readdir(struct file *, struct dir_context *,
29 struct ncp_cache_control *); 29 struct ncp_cache_control *);
30 30
31static int ncp_readdir(struct file *, void *, filldir_t); 31static int ncp_readdir(struct file *, struct dir_context *);
32 32
33static int ncp_create(struct inode *, struct dentry *, umode_t, bool); 33static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); 34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
49{ 49{
50 .llseek = generic_file_llseek, 50 .llseek = generic_file_llseek,
51 .read = generic_read_dir, 51 .read = generic_read_dir,
52 .readdir = ncp_readdir, 52 .iterate = ncp_readdir,
53 .unlocked_ioctl = ncp_ioctl, 53 .unlocked_ioctl = ncp_ioctl,
54#ifdef CONFIG_COMPAT 54#ifdef CONFIG_COMPAT
55 .compat_ioctl = ncp_compat_ioctl, 55 .compat_ioctl = ncp_compat_ioctl,
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
73 * Dentry operations routines 73 * Dentry operations routines
74 */ 74 */
75static int ncp_lookup_validate(struct dentry *, unsigned int); 75static int ncp_lookup_validate(struct dentry *, unsigned int);
76static int ncp_hash_dentry(const struct dentry *, const struct inode *, 76static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77 struct qstr *); 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
79 const struct dentry *, const struct inode *,
80 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
82 80
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
119/* 117/*
120 * Note: leave the hash unchanged if the directory 118 * Note: leave the hash unchanged if the directory
121 * is case-sensitive. 119 * is case-sensitive.
120 *
121 * Accessing the parent inode can be racy under RCU pathwalking.
122 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
123 * the callers will handle races.
122 */ 124 */
123static int 125static int
124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, 126ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
125 struct qstr *this)
126{ 127{
128 struct inode *inode = ACCESS_ONCE(dentry->d_inode);
129
130 if (!inode)
131 return 0;
132
127 if (!ncp_case_sensitive(inode)) { 133 if (!ncp_case_sensitive(inode)) {
128 struct super_block *sb = dentry->d_sb; 134 struct super_block *sb = dentry->d_sb;
129 struct nls_table *t; 135 struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
140 return 0; 146 return 0;
141} 147}
142 148
149/*
150 * Accessing the parent inode can be racy under RCU pathwalking.
151 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
152 * the callers will handle races.
153 */
143static int 154static int
144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, 155ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name) 156 unsigned int len, const char *str, const struct qstr *name)
147{ 157{
158 struct inode *pinode;
159
148 if (len != name->len) 160 if (len != name->len)
149 return 1; 161 return 1;
150 162
163 pinode = ACCESS_ONCE(parent->d_inode);
164 if (!pinode)
165 return 1;
166
151 if (ncp_case_sensitive(pinode)) 167 if (ncp_case_sensitive(pinode))
152 return strncmp(str, name->name, len); 168 return strncmp(str, name->name, len);
153 169
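
With the signature change, d_hash() and d_compare() no longer receive the inode from the caller, and under RCU-walk dentry->d_inode can change or go NULL concurrently; hence the ACCESS_ONCE() snapshots above. The defensive shape of the new callbacks, as a sketch (examplefs is hypothetical):

static int examplefs_hash(const struct dentry *dentry, struct qstr *q)
{
	/* snapshot d_inode exactly once; RCU-walk races with unlink */
	struct inode *inode = ACCESS_ONCE(dentry->d_inode);

	if (!inode)
		return 0;	/* leave the hash alone; callers handle races */
	/* ... case-fold q->name according to per-inode settings ... */
	return 0;
}
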
@@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
424 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 440 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
425} 441}
426 442
427static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) 443static int ncp_readdir(struct file *file, struct dir_context *ctx)
428{ 444{
429 struct dentry *dentry = filp->f_path.dentry; 445 struct dentry *dentry = file->f_path.dentry;
430 struct inode *inode = dentry->d_inode; 446 struct inode *inode = dentry->d_inode;
431 struct page *page = NULL; 447 struct page *page = NULL;
432 struct ncp_server *server = NCP_SERVER(inode); 448 struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
440 456
441 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", 457 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
442 dentry->d_parent->d_name.name, dentry->d_name.name, 458 dentry->d_parent->d_name.name, dentry->d_name.name,
443 (int) filp->f_pos); 459 (int) ctx->pos);
444 460
445 result = -EIO; 461 result = -EIO;
446 /* Do not generate '.' and '..' when server is dead. */ 462 /* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
448 goto out; 464 goto out;
449 465
450 result = 0; 466 result = 0;
451 if (filp->f_pos == 0) { 467 if (!dir_emit_dots(file, ctx))
452 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 468 goto out;
453 goto out;
454 filp->f_pos = 1;
455 }
456 if (filp->f_pos == 1) {
457 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
458 goto out;
459 filp->f_pos = 2;
460 }
461 469
462 page = grab_cache_page(&inode->i_data, 0); 470 page = grab_cache_page(&inode->i_data, 0);
463 if (!page) 471 if (!page)
@@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
469 if (!PageUptodate(page) || !ctl.head.eof) 477 if (!PageUptodate(page) || !ctl.head.eof)
470 goto init_cache; 478 goto init_cache;
471 479
472 if (filp->f_pos == 2) { 480 if (ctx->pos == 2) {
473 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) 481 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
474 goto init_cache; 482 goto init_cache;
475 483
@@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 goto init_cache; 487 goto init_cache;
480 } 488 }
481 489
482 if (filp->f_pos > ctl.head.end) 490 if (ctx->pos > ctl.head.end)
483 goto finished; 491 goto finished;
484 492
485 ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); 493 ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
486 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; 494 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE;
487 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; 495 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE;
488 496
@@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
497 } 505 }
498 while (ctl.idx < NCP_DIRCACHE_SIZE) { 506 while (ctl.idx < NCP_DIRCACHE_SIZE) {
499 struct dentry *dent; 507 struct dentry *dent;
500 int res; 508 bool over;
501 509
502 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 510 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
503 dentry, filp->f_pos); 511 dentry, ctx->pos);
504 if (!dent) 512 if (!dent)
505 goto invalid_cache; 513 goto invalid_cache;
506 res = filldir(dirent, dent->d_name.name, 514 over = !dir_emit(ctx, dent->d_name.name,
507 dent->d_name.len, filp->f_pos, 515 dent->d_name.len,
508 dent->d_inode->i_ino, DT_UNKNOWN); 516 dent->d_inode->i_ino, DT_UNKNOWN);
509 dput(dent); 517 dput(dent);
510 if (res) 518 if (over)
511 goto finished; 519 goto finished;
512 filp->f_pos += 1; 520 ctx->pos += 1;
513 ctl.idx += 1; 521 ctl.idx += 1;
514 if (filp->f_pos > ctl.head.end) 522 if (ctx->pos > ctl.head.end)
515 goto finished; 523 goto finished;
516 } 524 }
517 if (ctl.page) { 525 if (ctl.page) {
@@ -548,9 +556,9 @@ init_cache:
548 ctl.valid = 1; 556 ctl.valid = 1;
549read_really: 557read_really:
550 if (ncp_is_server_root(inode)) { 558 if (ncp_is_server_root(inode)) {
551 ncp_read_volume_list(filp, dirent, filldir, &ctl); 559 ncp_read_volume_list(file, ctx, &ctl);
552 } else { 560 } else {
553 ncp_do_readdir(filp, dirent, filldir, &ctl); 561 ncp_do_readdir(file, ctx, &ctl);
554 } 562 }
555 ctl.head.end = ctl.fpos - 1; 563 ctl.head.end = ctl.fpos - 1;
556 ctl.head.eof = ctl.valid; 564 ctl.head.eof = ctl.valid;
@@ -573,11 +581,11 @@ out:
573} 581}
574 582
575static int 583static int
576ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 584ncp_fill_cache(struct file *file, struct dir_context *ctx,
577 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 585 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
578 int inval_childs) 586 int inval_childs)
579{ 587{
580 struct dentry *newdent, *dentry = filp->f_path.dentry; 588 struct dentry *newdent, *dentry = file->f_path.dentry;
581 struct inode *dir = dentry->d_inode; 589 struct inode *dir = dentry->d_inode;
582 struct ncp_cache_control ctl = *ctrl; 590 struct ncp_cache_control ctl = *ctrl;
583 struct qstr qname; 591 struct qstr qname;
@@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
666end_advance: 674end_advance:
667 if (!valid) 675 if (!valid)
668 ctl.valid = 0; 676 ctl.valid = 0;
669 if (!ctl.filled && (ctl.fpos == filp->f_pos)) { 677 if (!ctl.filled && (ctl.fpos == ctx->pos)) {
670 if (!ino)
671 ino = find_inode_number(dentry, &qname);
672 if (!ino) 678 if (!ino)
673 ino = iunique(dir->i_sb, 2); 679 ino = iunique(dir->i_sb, 2);
674 ctl.filled = filldir(dirent, qname.name, qname.len, 680 ctl.filled = !dir_emit(ctx, qname.name, qname.len,
675 filp->f_pos, ino, DT_UNKNOWN); 681 ino, DT_UNKNOWN);
676 if (!ctl.filled) 682 if (!ctl.filled)
677 filp->f_pos += 1; 683 ctx->pos += 1;
678 } 684 }
679 ctl.fpos += 1; 685 ctl.fpos += 1;
680 ctl.idx += 1; 686 ctl.idx += 1;
@@ -683,10 +689,10 @@ end_advance:
683} 689}
684 690
685static void 691static void
686ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, 692ncp_read_volume_list(struct file *file, struct dir_context *ctx,
687 struct ncp_cache_control *ctl) 693 struct ncp_cache_control *ctl)
688{ 694{
689 struct dentry *dentry = filp->f_path.dentry; 695 struct dentry *dentry = file->f_path.dentry;
690 struct inode *inode = dentry->d_inode; 696 struct inode *inode = dentry->d_inode;
691 struct ncp_server *server = NCP_SERVER(inode); 697 struct ncp_server *server = NCP_SERVER(inode);
692 struct ncp_volume_info info; 698 struct ncp_volume_info info;
@@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
694 int i; 700 int i;
695 701
696 DPRINTK("ncp_read_volume_list: pos=%ld\n", 702 DPRINTK("ncp_read_volume_list: pos=%ld\n",
697 (unsigned long) filp->f_pos); 703 (unsigned long) ctx->pos);
698 704
699 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { 705 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
700 int inval_dentry; 706 int inval_dentry;
@@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
715 } 721 }
716 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); 722 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
717 entry.volume = entry.i.volNumber; 723 entry.volume = entry.i.volNumber;
718 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) 724 if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
719 return; 725 return;
720 } 726 }
721} 727}
722 728
723static void 729static void
724ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, 730ncp_do_readdir(struct file *file, struct dir_context *ctx,
725 struct ncp_cache_control *ctl) 731 struct ncp_cache_control *ctl)
726{ 732{
727 struct dentry *dentry = filp->f_path.dentry; 733 struct dentry *dentry = file->f_path.dentry;
728 struct inode *dir = dentry->d_inode; 734 struct inode *dir = dentry->d_inode;
729 struct ncp_server *server = NCP_SERVER(dir); 735 struct ncp_server *server = NCP_SERVER(dir);
730 struct nw_search_sequence seq; 736 struct nw_search_sequence seq;
@@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
736 742
737 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", 743 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
738 dentry->d_parent->d_name.name, dentry->d_name.name, 744 dentry->d_parent->d_name.name, dentry->d_name.name,
739 (unsigned long) filp->f_pos); 745 (unsigned long) ctx->pos);
740 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", 746 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
741 dentry->d_name.name, NCP_FINFO(dir)->volNumber, 747 dentry->d_name.name, NCP_FINFO(dir)->volNumber,
742 NCP_FINFO(dir)->dirEntNum); 748 NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
778 rpl += onerpl; 784 rpl += onerpl;
779 rpls -= onerpl; 785 rpls -= onerpl;
780 entry.volume = entry.i.volNumber; 786 entry.volume = entry.i.volNumber;
781 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) 787 if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
782 break; 788 break;
783 } 789 }
784 } while (more); 790 } while (more);
@@ -1131,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1137 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1132 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1138 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1133 1139
1134 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1135 /*
1136 * fail with EBUSY if there are still references to this
1137 * directory.
1138 */
1139 dentry_unhash(new_dentry);
1140 error = -EBUSY;
1141 if (!d_unhashed(new_dentry))
1142 goto out;
1143 }
1144
1145 ncp_age_dentry(server, old_dentry); 1140 ncp_age_dentry(server, old_dentry);
1146 ncp_age_dentry(server, new_dentry); 1141 ncp_age_dentry(server, new_dentry);
1147 1142
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 26910c8154da..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
403 switch (optval) { 403 switch (optval) {
404 case 'u': 404 case 'u':
405 data->uid = make_kuid(current_user_ns(), optint); 405 data->uid = make_kuid(current_user_ns(), optint);
406 if (!uid_valid(data->uid)) 406 if (!uid_valid(data->uid)) {
407 ret = -EINVAL;
407 goto err; 408 goto err;
409 }
408 break; 410 break;
409 case 'g': 411 case 'g':
410 data->gid = make_kgid(current_user_ns(), optint); 412 data->gid = make_kgid(current_user_ns(), optint);
411 if (!gid_valid(data->gid)) 413 if (!gid_valid(data->gid)) {
414 ret = -EINVAL;
412 goto err; 415 goto err;
416 }
413 break; 417 break;
414 case 'o': 418 case 'o':
415 data->mounted_uid = make_kuid(current_user_ns(), optint); 419 data->mounted_uid = make_kuid(current_user_ns(), optint);
416 if (!uid_valid(data->mounted_uid)) 420 if (!uid_valid(data->mounted_uid)) {
421 ret = -EINVAL;
417 goto err; 422 goto err;
423 }
418 break; 424 break;
419 case 'm': 425 case 'm':
420 data->file_mode = optint; 426 data->file_mode = optint;
@@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
891 if (!server) /* How could this happen? */ 897 if (!server) /* How could this happen? */
892 goto out; 898 goto out;
893 899
900 result = -EPERM;
901 if (IS_DEADDIR(dentry->d_inode))
902 goto out;
903
894 /* ageing the dentry to force validation */ 904 /* ageing the dentry to force validation */
895 ncp_age_dentry(server, dentry); 905 ncp_age_dentry(server, dentry);
896 906
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
117 return -EINVAL; 117 return -EINVAL;
118 /* we do not support files bigger than 4GB... for now we 118 /* we do not support files bigger than 4GB... for now we
119 support just 4GB... */ 119 support just 4GB... */
120 if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff 120 if (vma_pages(vma) + vma->vm_pgoff
121 > (1U << (32 - PAGE_SHIFT))) 121 > (1U << (32 - PAGE_SHIFT)))
122 return -EFBIG; 122 return -EFBIG;
123 123
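
The ncp_mmap() hunk is a pure cleanup: vma_pages(vma) is the stock helper for (vma->vm_end - vma->vm_start) >> PAGE_SHIFT, so the guard reads as "pages in the mapping plus the starting page offset must stay within a 32-bit page index". Spelled out as a hypothetical predicate:

static bool ncp_mapping_fits_4gb(struct vm_area_struct *vma)
{
	/* vma_pages() expands to (vm_end - vm_start) >> PAGE_SHIFT */
	return vma_pages(vma) + vma->vm_pgoff <= (1UL << (32 - PAGE_SHIFT));
}
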
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
104 104
105 If unsure, say N. 105 If unsure, say N.
106 106
107config NFS_V4_2
108 bool "NFS client support for NFSv4.2"
109 depends on NFS_V4_1
110 help
111 This option enables support for minor version 2 of the NFSv4 protocol
112 in the kernel's NFS client.
113
114 If unsure, say N.
115
107config PNFS_FILE_LAYOUT 116config PNFS_FILE_LAYOUT
108 tristate 117 tristate
109 depends on NFS_V4_1 118 depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
131 If the NFS client is unchanged from the upstream kernel, this 140 If the NFS client is unchanged from the upstream kernel, this
132 option should be set to the default "kernel.org". 141 option should be set to the default "kernel.org".
133 142
143config NFS_V4_SECURITY_LABEL
144 bool
145 depends on NFS_V4_2 && SECURITY
146 default y
147
134config ROOT_NFS 148config ROOT_NFS
135 bool "Root file system on NFS" 149 bool "Root file system on NFS"
136 depends on NFS_FS=y && IP_PNP 150 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
8 direct.o pagelist.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o \ 9 write.o namespace.o mount_clnt.o
10 dns_resolve.o cache_lib.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 11nfs-$(CONFIG_SYSCTL) += sysctl.o
13nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 12nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
22obj-$(CONFIG_NFS_V4) += nfsv4.o 21obj-$(CONFIG_NFS_V4) += nfsv4.o
23nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ 22nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 23 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 24 nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
25nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1089 dev->pgbase = 0; 1089 dev->pgbase = 0;
1090 dev->pglen = PAGE_SIZE * max_pages; 1090 dev->pglen = PAGE_SIZE * max_pages;
1091 dev->mincount = 0; 1091 dev->mincount = 0;
1092 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
1092 1093
1093 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 1094 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
1094 rc = nfs4_proc_getdeviceinfo(server, dev); 1095 rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
1095 dprintk("%s getdevice info returns %d\n", __func__, rc); 1096 dprintk("%s getdevice info returns %d\n", __func__, rc);
1096 if (rc) { 1097 if (rc) {
1097 rv = ERR_PTR(rc); 1098 rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
211 struct svc_rqst *rqstp; 211 struct svc_rqst *rqstp;
212 int (*callback_svc)(void *vrqstp); 212 int (*callback_svc)(void *vrqstp);
213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; 213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
214 char svc_name[12];
215 int ret; 214 int ret;
216 215
217 nfs_callback_bc_serv(minorversion, xprt, serv); 216 nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
235 234
236 svc_sock_update_bufs(serv); 235 svc_sock_update_bufs(serv);
237 236
238 sprintf(svc_name, "nfsv4.%u-svc", minorversion);
239 cb_info->serv = serv; 237 cb_info->serv = serv;
240 cb_info->rqst = rqstp; 238 cb_info->rqst = rqstp;
241 cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); 239 cb_info->task = kthread_run(callback_svc, cb_info->rqst,
240 "nfsv4.%u-svc", minorversion);
242 if (IS_ERR(cb_info->task)) { 241 if (IS_ERR(cb_info->task)) {
243 ret = PTR_ERR(cb_info->task); 242 ret = PTR_ERR(cb_info->task);
244 svc_exit_thread(cb_info->rqst); 243 svc_exit_thread(cb_info->rqst);
@@ -282,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
282 ret = nfs4_callback_up_net(serv, net); 281 ret = nfs4_callback_up_net(serv, net);
283 break; 282 break;
284 case 1: 283 case 1:
284 case 2:
285 ret = nfs41_callback_up_net(serv, net); 285 ret = nfs41_callback_up_net(serv, net);
286 break; 286 break;
287 default: 287 default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
32 OP_CB_WANTS_CANCELLED = 12, 32 OP_CB_WANTS_CANCELLED = 12,
33 OP_CB_NOTIFY_LOCK = 13, 33 OP_CB_NOTIFY_LOCK = 13,
34 OP_CB_NOTIFY_DEVICEID = 14, 34 OP_CB_NOTIFY_DEVICEID = 14,
35/* Callback operations new to NFSv4.2 */
36 OP_CB_OFFLOAD = 15,
35 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
36}; 38};
37 39
@@ -39,6 +41,7 @@ struct cb_process_state {
39 __be32 drc_status; 41 __be32 drc_status;
40 struct nfs_client *clp; 42 struct nfs_client *clp;
41 u32 slotid; 43 u32 slotid;
44 u32 minorversion;
42 struct net *net; 45 struct net *net;
43}; 46};
44 47
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
406 int i; 406 int i;
407 __be32 status = htonl(NFS4ERR_BADSESSION); 407 __be32 status = htonl(NFS4ERR_BADSESSION);
408 408
409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); 409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
410 &args->csa_sessionid, cps->minorversion);
410 if (clp == NULL) 411 if (clp == NULL)
411 goto out; 412 goto out;
412 413
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
168 hdr->minorversion = ntohl(*p++); 168 hdr->minorversion = ntohl(*p++);
169 /* Check minor version is zero or one. */ 169 /* Check for minor version support */
170 if (hdr->minorversion <= 1) { 170 if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
172 } else { 172 } else {
173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " 173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
174 "illegal minor version %u!\n", 174 "illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
786} 786}
787#endif /* CONFIG_NFS_V4_1 */ 787#endif /* CONFIG_NFS_V4_1 */
788 788
789#ifdef CONFIG_NFS_V4_2
790static __be32
791preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
792{
793 __be32 status = preprocess_nfs41_op(nop, op_nr, op);
794 if (status != htonl(NFS4ERR_OP_ILLEGAL))
795 return status;
796
797 if (op_nr == OP_CB_OFFLOAD)
798 return htonl(NFS4ERR_NOTSUPP);
799 return htonl(NFS4ERR_OP_ILLEGAL);
800}
801#else /* CONFIG_NFS_V4_2 */
802static __be32
803preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
804{
805 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
806}
807#endif /* CONFIG_NFS_V4_2 */
808
789static __be32 809static __be32
790preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) 810preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
791{ 811{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
801 return htonl(NFS_OK); 821 return htonl(NFS_OK);
802} 822}
803 823
804static __be32 process_op(uint32_t minorversion, int nop, 824static __be32 process_op(int nop, struct svc_rqst *rqstp,
805 struct svc_rqst *rqstp,
806 struct xdr_stream *xdr_in, void *argp, 825 struct xdr_stream *xdr_in, void *argp,
807 struct xdr_stream *xdr_out, void *resp, 826 struct xdr_stream *xdr_out, void *resp,
808 struct cb_process_state *cps) 827 struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
819 return status; 838 return status;
820 839
821 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 840 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
822 __func__, minorversion, nop, op_nr); 841 __func__, cps->minorversion, nop, op_nr);
842
843 switch (cps->minorversion) {
844 case 0:
845 status = preprocess_nfs4_op(op_nr, &op);
846 break;
847 case 1:
848 status = preprocess_nfs41_op(nop, op_nr, &op);
849 break;
850 case 2:
851 status = preprocess_nfs42_op(nop, op_nr, &op);
852 break;
853 default:
854 status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
855 }
823 856
824 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
825 preprocess_nfs4_op(op_nr, &op);
826 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 857 if (status == htonl(NFS4ERR_OP_ILLEGAL))
827 op_nr = OP_CB_ILLEGAL; 858 op_nr = OP_CB_ILLEGAL;
828 if (status) 859 if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
885 return rpc_drop_reply; 916 return rpc_drop_reply;
886 } 917 }
887 918
919 cps.minorversion = hdr_arg.minorversion;
888 hdr_res.taglen = hdr_arg.taglen; 920 hdr_res.taglen = hdr_arg.taglen;
889 hdr_res.tag = hdr_arg.tag; 921 hdr_res.tag = hdr_arg.tag;
890 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 922 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
891 return rpc_system_err; 923 return rpc_system_err;
892 924
893 while (status == 0 && nops != hdr_arg.nops) { 925 while (status == 0 && nops != hdr_arg.nops) {
894 status = process_op(hdr_arg.minorversion, nops, rqstp, 926 status = process_op(nops, rqstp, &xdr_in,
895 &xdr_in, argp, &xdr_out, resp, &cps); 927 argp, &xdr_out, resp, &cps);
896 nops++; 928 nops++;
897 } 929 }
898 930
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
753 data->timeo, data->retrans); 753 data->timeo, data->retrans);
754 if (data->flags & NFS_MOUNT_NORESVPORT) 754 if (data->flags & NFS_MOUNT_NORESVPORT)
755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
756 if (server->options & NFS_OPTION_MIGRATION)
757 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
758 756
759 /* Allocate or find a client reference we can use */ 757 /* Allocate or find a client reference we can use */
760 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 758 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1076 } 1074 }
1077 1075
1078 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1076 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1079 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); 1077 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
1080 if (error < 0) { 1078 if (error < 0) {
1081 dprintk("nfs_create_server: getattr error = %d\n", -error); 1079 dprintk("nfs_create_server: getattr error = %d\n", -error);
1082 goto error; 1080 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57db3244f4d9..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
73 if (inode->i_flock == NULL) 73 if (inode->i_flock == NULL)
74 goto out; 74 goto out;
75 75
76 /* Protect inode->i_flock using the file locks lock */ 76 /* Protect inode->i_flock using the i_lock */
77 lock_flocks(); 77 spin_lock(&inode->i_lock);
78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
80 continue; 80 continue;
81 if (nfs_file_open_context(fl->fl_file) != ctx) 81 if (nfs_file_open_context(fl->fl_file) != ctx)
82 continue; 82 continue;
83 unlock_flocks(); 83 spin_unlock(&inode->i_lock);
84 status = nfs4_lock_delegation_recall(fl, state, stateid); 84 status = nfs4_lock_delegation_recall(fl, state, stateid);
85 if (status < 0) 85 if (status < 0)
86 goto out; 86 goto out;
87 lock_flocks(); 87 spin_lock(&inode->i_lock);
88 } 88 }
89 unlock_flocks(); 89 spin_unlock(&inode->i_lock);
90out: 90out:
91 return status; 91 return status;
92} 92}
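
The delegation.c hunk is the same lock_flocks() to i_lock conversion as in fs/locks.c, with one wrinkle: nfs4_lock_delegation_recall() issues RPCs and can sleep, so the spinlock is dropped around the call and retaken before the walk continues. The shape of that pattern, as a sketch (do_blocking_recall() is hypothetical):

static int do_blocking_recall(struct file_lock *fl);	/* may sleep */

static int recall_all_locks(struct inode *inode)
{
	struct file_lock *fl;
	int status = 0;

	spin_lock(&inode->i_lock);
	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
		spin_unlock(&inode->i_lock);	/* cannot sleep under a spinlock */
		status = do_blocking_recall(fl);
		if (status < 0)
			return status;
		spin_lock(&inode->i_lock);	/* retake before reading fl_next */
	}
	spin_unlock(&inode->i_lock);
	return status;
}
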
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e093e73178b7..e474ca2b2bfe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
33#include <linux/pagevec.h> 33#include <linux/pagevec.h>
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/swap.h>
36#include <linux/sched.h> 37#include <linux/sched.h>
37#include <linux/kmemleak.h> 38#include <linux/kmemleak.h>
38#include <linux/xattr.h> 39#include <linux/xattr.h>
@@ -46,7 +47,7 @@
46 47
47static int nfs_opendir(struct inode *, struct file *); 48static int nfs_opendir(struct inode *, struct file *);
48static int nfs_closedir(struct inode *, struct file *); 49static int nfs_closedir(struct inode *, struct file *);
49static int nfs_readdir(struct file *, void *, filldir_t); 50static int nfs_readdir(struct file *, struct dir_context *);
50static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 51static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
51static loff_t nfs_llseek_dir(struct file *, loff_t, int); 52static loff_t nfs_llseek_dir(struct file *, loff_t, int);
52static void nfs_readdir_clear_array(struct page*); 53static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*);
54const struct file_operations nfs_dir_operations = { 55const struct file_operations nfs_dir_operations = {
55 .llseek = nfs_llseek_dir, 56 .llseek = nfs_llseek_dir,
56 .read = generic_read_dir, 57 .read = generic_read_dir,
57 .readdir = nfs_readdir, 58 .iterate = nfs_readdir,
58 .open = nfs_opendir, 59 .open = nfs_opendir,
59 .release = nfs_closedir, 60 .release = nfs_closedir,
60 .fsync = nfs_fsync_dir, 61 .fsync = nfs_fsync_dir,
@@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
147typedef struct { 148typedef struct {
148 struct file *file; 149 struct file *file;
149 struct page *page; 150 struct page *page;
151 struct dir_context *ctx;
150 unsigned long page_index; 152 unsigned long page_index;
151 u64 *dir_cookie; 153 u64 *dir_cookie;
152 u64 last_cookie; 154 u64 last_cookie;
@@ -252,7 +254,7 @@ out:
252static 254static
253int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 255int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
254{ 256{
255 loff_t diff = desc->file->f_pos - desc->current_index; 257 loff_t diff = desc->ctx->pos - desc->current_index;
256 unsigned int index; 258 unsigned int index;
257 259
258 if (diff < 0) 260 if (diff < 0)
@@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 291 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
290 ctx->duped = 0; 292 ctx->duped = 0;
291 ctx->attr_gencount = nfsi->attr_gencount; 293 ctx->attr_gencount = nfsi->attr_gencount;
292 } else if (new_pos < desc->file->f_pos) { 294 } else if (new_pos < desc->ctx->pos) {
293 if (ctx->duped > 0 295 if (ctx->duped > 0
294 && ctx->dup_cookie == *desc->dir_cookie) { 296 && ctx->dup_cookie == *desc->dir_cookie) {
295 if (printk_ratelimit()) { 297 if (printk_ratelimit()) {
@@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
307 ctx->dup_cookie = *desc->dir_cookie; 309 ctx->dup_cookie = *desc->dir_cookie;
308 ctx->duped = -1; 310 ctx->duped = -1;
309 } 311 }
310 desc->file->f_pos = new_pos; 312 desc->ctx->pos = new_pos;
311 desc->cache_entry_index = i; 313 desc->cache_entry_index = i;
312 return 0; 314 return 0;
313 } 315 }
@@ -405,13 +407,13 @@ different:
405} 407}
406 408
407static 409static
408bool nfs_use_readdirplus(struct inode *dir, struct file *filp) 410bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
409{ 411{
410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) 412 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
411 return false; 413 return false;
412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) 414 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
413 return true; 415 return true;
414 if (filp->f_pos == 0) 416 if (ctx->pos == 0)
415 return true; 417 return true;
416 return false; 418 return false;
417} 419}
@@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
435 struct dentry *alias; 437 struct dentry *alias;
436 struct inode *dir = parent->d_inode; 438 struct inode *dir = parent->d_inode;
437 struct inode *inode; 439 struct inode *inode;
440 int status;
438 441
439 if (filename.name[0] == '.') { 442 if (filename.name[0] == '.') {
440 if (filename.len == 1) 443 if (filename.len == 1)
@@ -447,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
447 dentry = d_lookup(parent, &filename); 450 dentry = d_lookup(parent, &filename);
448 if (dentry != NULL) { 451 if (dentry != NULL) {
449 if (nfs_same_file(dentry, entry)) { 452 if (nfs_same_file(dentry, entry)) {
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454 status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
455 if (!status)
456 nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
451 goto out; 457 goto out;
452 } else { 458 } else {
453 if (d_invalidate(dentry) != 0) 459 if (d_invalidate(dentry) != 0)
@@ -460,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
460 if (dentry == NULL) 466 if (dentry == NULL)
461 return; 467 return;
462 468
463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 469 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
464 if (IS_ERR(inode)) 470 if (IS_ERR(inode))
465 goto out; 471 goto out;
466 472
@@ -585,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
585 if (entry.fh == NULL || entry.fattr == NULL) 591 if (entry.fh == NULL || entry.fattr == NULL)
586 goto out; 592 goto out;
587 593
594 entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
595 if (IS_ERR(entry.label)) {
596 status = PTR_ERR(entry.label);
597 goto out;
598 }
599
588 array = nfs_readdir_get_array(page); 600 array = nfs_readdir_get_array(page);
589 if (IS_ERR(array)) { 601 if (IS_ERR(array)) {
590 status = PTR_ERR(array); 602 status = PTR_ERR(array);
591 goto out; 603 goto out_label_free;
592 } 604 }
593 memset(array, 0, sizeof(struct nfs_cache_array)); 605 memset(array, 0, sizeof(struct nfs_cache_array));
594 array->eof_index = -1; 606 array->eof_index = -1;
@@ -614,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
614 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 626 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
615out_release_array: 627out_release_array:
616 nfs_readdir_release_array(page); 628 nfs_readdir_release_array(page);
629out_label_free:
630 nfs4_label_free(entry.label);
617out: 631out:
618 nfs_free_fattr(entry.fattr); 632 nfs_free_fattr(entry.fattr);
619 nfs_free_fhandle(entry.fh); 633 nfs_free_fhandle(entry.fh);
@@ -702,8 +716,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
702 * Once we've found the start of the dirent within a page: fill 'er up... 716 * Once we've found the start of the dirent within a page: fill 'er up...
703 */ 717 */
704static 718static
705int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, 719int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
706 filldir_t filldir)
707{ 720{
708 struct file *file = desc->file; 721 struct file *file = desc->file;
709 int i = 0; 722 int i = 0;
@@ -721,13 +734,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
721 struct nfs_cache_array_entry *ent; 734 struct nfs_cache_array_entry *ent;
722 735
723 ent = &array->array[i]; 736 ent = &array->array[i];
724 if (filldir(dirent, ent->string.name, ent->string.len, 737 if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
725 file->f_pos, nfs_compat_user_ino64(ent->ino), 738 nfs_compat_user_ino64(ent->ino), ent->d_type)) {
726 ent->d_type) < 0) {
727 desc->eof = 1; 739 desc->eof = 1;
728 break; 740 break;
729 } 741 }
730 file->f_pos++; 742 desc->ctx->pos++;
731 if (i < (array->size-1)) 743 if (i < (array->size-1))
732 *desc->dir_cookie = array->array[i+1].cookie; 744 *desc->dir_cookie = array->array[i+1].cookie;
733 else 745 else
@@ -759,8 +771,7 @@ out:
759 * directory in the page cache by the time we get here. 771 * directory in the page cache by the time we get here.
760 */ 772 */
761static inline 773static inline
762int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 774int uncached_readdir(nfs_readdir_descriptor_t *desc)
763 filldir_t filldir)
764{ 775{
765 struct page *page = NULL; 776 struct page *page = NULL;
766 int status; 777 int status;
@@ -785,7 +796,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
785 if (status < 0) 796 if (status < 0)
786 goto out_release; 797 goto out_release;
787 798
788 status = nfs_do_filldir(desc, dirent, filldir); 799 status = nfs_do_filldir(desc);
789 800
790 out: 801 out:
791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 802 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +811,37 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
800 last cookie cache takes care of the common case of reading the 811 last cookie cache takes care of the common case of reading the
801 whole directory. 812 whole directory.
802 */ 813 */
803static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 814static int nfs_readdir(struct file *file, struct dir_context *ctx)
804{ 815{
805 struct dentry *dentry = filp->f_path.dentry; 816 struct dentry *dentry = file->f_path.dentry;
806 struct inode *inode = dentry->d_inode; 817 struct inode *inode = dentry->d_inode;
807 nfs_readdir_descriptor_t my_desc, 818 nfs_readdir_descriptor_t my_desc,
808 *desc = &my_desc; 819 *desc = &my_desc;
809 struct nfs_open_dir_context *dir_ctx = filp->private_data; 820 struct nfs_open_dir_context *dir_ctx = file->private_data;
810 int res; 821 int res = 0;
811 822
812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 823 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
813 dentry->d_parent->d_name.name, dentry->d_name.name, 824 dentry->d_parent->d_name.name, dentry->d_name.name,
814 (long long)filp->f_pos); 825 (long long)ctx->pos);
815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 826 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
816 827
817 /* 828 /*
818 * filp->f_pos points to the dirent entry number. 829 * ctx->pos points to the dirent entry number.
819 * *desc->dir_cookie has the cookie for the next entry. We have 830 * *desc->dir_cookie has the cookie for the next entry. We have
820 * to either find the entry with the appropriate number or 831 * to either find the entry with the appropriate number or
821 * revalidate the cookie. 832 * revalidate the cookie.
822 */ 833 */
823 memset(desc, 0, sizeof(*desc)); 834 memset(desc, 0, sizeof(*desc));
824 835
825 desc->file = filp; 836 desc->file = file;
837 desc->ctx = ctx;
826 desc->dir_cookie = &dir_ctx->dir_cookie; 838 desc->dir_cookie = &dir_ctx->dir_cookie;
827 desc->decode = NFS_PROTO(inode)->decode_dirent; 839 desc->decode = NFS_PROTO(inode)->decode_dirent;
828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; 840 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
829 841
830 nfs_block_sillyrename(dentry); 842 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, filp->f_mapping); 843 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
844 res = nfs_revalidate_mapping(inode, file->f_mapping);
832 if (res < 0) 845 if (res < 0)
833 goto out; 846 goto out;
834 847
@@ -840,7 +853,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
840 /* This means either end of directory */ 853 /* This means either end of directory */
841 if (*desc->dir_cookie && desc->eof == 0) { 854 if (*desc->dir_cookie && desc->eof == 0) {
842 /* Or that the server has 'lost' a cookie */ 855 /* Or that the server has 'lost' a cookie */
843 res = uncached_readdir(desc, dirent, filldir); 856 res = uncached_readdir(desc);
844 if (res == 0) 857 if (res == 0)
845 continue; 858 continue;
846 } 859 }
@@ -857,7 +870,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
857 if (res < 0) 870 if (res < 0)
858 break; 871 break;
859 872
860 res = nfs_do_filldir(desc, dirent, filldir); 873 res = nfs_do_filldir(desc);
861 if (res < 0) 874 if (res < 0)
862 break; 875 break;
863 } while (!desc->eof); 876 } while (!desc->eof);
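
The hunks above are part of the 3.11 VFS conversion from the old filldir callback to the dir_context/dir_emit() API: the actor and position travel in struct dir_context, and the filesystem advances ctx->pos itself rather than poking file->f_pos. A minimal sketch of the new calling convention follows; "demofs" and its single fixed entry are invented for illustration.

#include <linux/fs.h>

static int demofs_iterate(struct file *file, struct dir_context *ctx)
{
        if (!dir_emit_dots(file, ctx))  /* "." and "..", pos 0 and 1 */
                return 0;
        if (ctx->pos == 2) {
                /* dir_emit() returns false once the caller's buffer fills */
                if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
                        return 0;
                ctx->pos++;     /* the fs advances ctx->pos; f_pos is untouched */
        }
        return 0;
}

static const struct file_operations demofs_dir_ops = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate        = demofs_iterate,
};

This is why nfs_do_filldir() loses its dirent/filldir arguments above and bumps desc->ctx->pos instead of file->f_pos.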
@@ -1040,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1040 struct dentry *parent; 1053 struct dentry *parent;
1041 struct nfs_fh *fhandle = NULL; 1054 struct nfs_fh *fhandle = NULL;
1042 struct nfs_fattr *fattr = NULL; 1055 struct nfs_fattr *fattr = NULL;
1056 struct nfs4_label *label = NULL;
1043 int error; 1057 int error;
1044 1058
1045 if (flags & LOOKUP_RCU) 1059 if (flags & LOOKUP_RCU)
@@ -1082,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1082 if (fhandle == NULL || fattr == NULL) 1096 if (fhandle == NULL || fattr == NULL)
1083 goto out_error; 1097 goto out_error;
1084 1098
1085 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1099 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
1100 if (IS_ERR(label))
1101 goto out_error;
1102
1103 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1086 if (error) 1104 if (error)
1087 goto out_bad; 1105 goto out_bad;
1088 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1106 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1090 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1108 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1091 goto out_bad; 1109 goto out_bad;
1092 1110
1111 nfs_setsecurity(inode, fattr, label);
1112
1093 nfs_free_fattr(fattr); 1113 nfs_free_fattr(fattr);
1094 nfs_free_fhandle(fhandle); 1114 nfs_free_fhandle(fhandle);
1115 nfs4_label_free(label);
1116
1095out_set_verifier: 1117out_set_verifier:
1096 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1118 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1097 out_valid: 1119 out_valid:
@@ -1108,6 +1130,7 @@ out_zap_parent:
1108 out_bad: 1130 out_bad:
1109 nfs_free_fattr(fattr); 1131 nfs_free_fattr(fattr);
1110 nfs_free_fhandle(fhandle); 1132 nfs_free_fhandle(fhandle);
1133 nfs4_label_free(label);
1111 nfs_mark_for_revalidate(dir); 1134 nfs_mark_for_revalidate(dir);
1112 if (inode && S_ISDIR(inode->i_mode)) { 1135 if (inode && S_ISDIR(inode->i_mode)) {
1113 /* Purge readdir caches. */ 1136 /* Purge readdir caches. */
@@ -1128,6 +1151,7 @@ out_zap_parent:
1128out_error: 1151out_error:
1129 nfs_free_fattr(fattr); 1152 nfs_free_fattr(fattr);
1130 nfs_free_fhandle(fhandle); 1153 nfs_free_fhandle(fhandle);
1154 nfs4_label_free(label);
1131 dput(parent); 1155 dput(parent);
1132 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1156 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1133 __func__, dentry->d_parent->d_name.name, 1157 __func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1256 struct inode *inode = NULL; 1280 struct inode *inode = NULL;
1257 struct nfs_fh *fhandle = NULL; 1281 struct nfs_fh *fhandle = NULL;
1258 struct nfs_fattr *fattr = NULL; 1282 struct nfs_fattr *fattr = NULL;
1283 struct nfs4_label *label = NULL;
1259 int error; 1284 int error;
1260 1285
1261 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1286 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1282 if (fhandle == NULL || fattr == NULL) 1307 if (fhandle == NULL || fattr == NULL)
1283 goto out; 1308 goto out;
1284 1309
1310 label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
1311 if (IS_ERR(label))
1312 goto out;
1313
1285 parent = dentry->d_parent; 1314 parent = dentry->d_parent;
1286 /* Protect against concurrent sillydeletes */ 1315 /* Protect against concurrent sillydeletes */
1287 nfs_block_sillyrename(parent); 1316 nfs_block_sillyrename(parent);
1288 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1317 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1289 if (error == -ENOENT) 1318 if (error == -ENOENT)
1290 goto no_entry; 1319 goto no_entry;
1291 if (error < 0) { 1320 if (error < 0) {
1292 res = ERR_PTR(error); 1321 res = ERR_PTR(error);
1293 goto out_unblock_sillyrename; 1322 goto out_unblock_sillyrename;
1294 } 1323 }
1295 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1324 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1296 res = ERR_CAST(inode); 1325 res = ERR_CAST(inode);
1297 if (IS_ERR(res)) 1326 if (IS_ERR(res))
1298 goto out_unblock_sillyrename; 1327 goto out_unblock_sillyrename;
@@ -1310,6 +1339,7 @@ no_entry:
1310 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1311out_unblock_sillyrename: 1340out_unblock_sillyrename:
1312 nfs_unblock_sillyrename(parent); 1341 nfs_unblock_sillyrename(parent);
1342 nfs4_label_free(label);
1313out: 1343out:
1314 nfs_free_fattr(fattr); 1344 nfs_free_fattr(fattr);
1315 nfs_free_fhandle(fhandle); 1345 nfs_free_fhandle(fhandle);
@@ -1357,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
1357{ 1387{
1358 int err; 1388 int err;
1359 1389
1360 if (ctx->dentry != dentry) {
1361 dput(ctx->dentry);
1362 ctx->dentry = dget(dentry);
1363 }
1364
1365 /* If the open_intent is for execute, we have an extra check to make */
1366 if (ctx->mode & FMODE_EXEC) {
1367 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1368 if (err < 0)
1369 goto out;
1370 }
1371
1372 err = finish_open(file, dentry, do_open, opened); 1390 err = finish_open(file, dentry, do_open, opened);
1373 if (err) 1391 if (err)
1374 goto out; 1392 goto out;
@@ -1427,13 +1445,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1427 1445
1428 nfs_block_sillyrename(dentry->d_parent); 1446 nfs_block_sillyrename(dentry->d_parent);
1429 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1447 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1430 d_drop(dentry); 1448 nfs_unblock_sillyrename(dentry->d_parent);
1431 if (IS_ERR(inode)) { 1449 if (IS_ERR(inode)) {
1432 nfs_unblock_sillyrename(dentry->d_parent);
1433 put_nfs_open_context(ctx); 1450 put_nfs_open_context(ctx);
1434 err = PTR_ERR(inode); 1451 err = PTR_ERR(inode);
1435 switch (err) { 1452 switch (err) {
1436 case -ENOENT: 1453 case -ENOENT:
1454 d_drop(dentry);
1437 d_add(dentry, NULL); 1455 d_add(dentry, NULL);
1438 break; 1456 break;
1439 case -EISDIR: 1457 case -EISDIR:
@@ -1449,16 +1467,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1449 } 1467 }
1450 goto out; 1468 goto out;
1451 } 1469 }
1452 res = d_add_unique(dentry, inode);
1453 if (res != NULL)
1454 dentry = res;
1455
1456 nfs_unblock_sillyrename(dentry->d_parent);
1457 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1458
1459 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1460 1470
1461 dput(res); 1471 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
1462out: 1472out:
1463 return err; 1473 return err;
1464 1474
@@ -1528,7 +1538,8 @@ no_open:
1528 * Code common to create, mkdir, and mknod. 1538 * Code common to create, mkdir, and mknod.
1529 */ 1539 */
1530int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1540int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1531 struct nfs_fattr *fattr) 1541 struct nfs_fattr *fattr,
1542 struct nfs4_label *label)
1532{ 1543{
1533 struct dentry *parent = dget_parent(dentry); 1544 struct dentry *parent = dget_parent(dentry);
1534 struct inode *dir = parent->d_inode; 1545 struct inode *dir = parent->d_inode;
@@ -1541,18 +1552,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1541 if (dentry->d_inode) 1552 if (dentry->d_inode)
1542 goto out; 1553 goto out;
1543 if (fhandle->size == 0) { 1554 if (fhandle->size == 0) {
1544 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1555 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
1545 if (error) 1556 if (error)
1546 goto out_error; 1557 goto out_error;
1547 } 1558 }
1548 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1559 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1549 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1560 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1550 struct nfs_server *server = NFS_SB(dentry->d_sb); 1561 struct nfs_server *server = NFS_SB(dentry->d_sb);
1551 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1562 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
1552 if (error < 0) 1563 if (error < 0)
1553 goto out_error; 1564 goto out_error;
1554 } 1565 }
1555 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1566 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1556 error = PTR_ERR(inode); 1567 error = PTR_ERR(inode);
1557 if (IS_ERR(inode)) 1568 if (IS_ERR(inode))
1558 goto out_error; 1569 goto out_error;
@@ -1721,7 +1732,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1721 dir->i_ino, dentry->d_name.name); 1732 dir->i_ino, dentry->d_name.name);
1722 1733
1723 spin_lock(&dentry->d_lock); 1734 spin_lock(&dentry->d_lock);
1724 if (dentry->d_count > 1) { 1735 if (d_count(dentry) > 1) {
1725 spin_unlock(&dentry->d_lock); 1736 spin_unlock(&dentry->d_lock);
1726 /* Start asynchronous writeout of the inode */ 1737 /* Start asynchronous writeout of the inode */
1727 write_inode_now(dentry->d_inode, 0); 1738 write_inode_now(dentry->d_inode, 0);
@@ -1759,7 +1770,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
1759 */ 1770 */
1760int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1771int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1761{ 1772{
1762 struct pagevec lru_pvec;
1763 struct page *page; 1773 struct page *page;
1764 char *kaddr; 1774 char *kaddr;
1765 struct iattr attr; 1775 struct iattr attr;
@@ -1799,11 +1809,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1799 * No big deal if we can't add this page to the page cache here. 1809 * No big deal if we can't add this page to the page cache here.
1800 * READLINK will get the missing page from the server if needed. 1810 * READLINK will get the missing page from the server if needed.
1801 */ 1811 */
1802 pagevec_init(&lru_pvec, 0); 1812 if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
1803 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1804 GFP_KERNEL)) { 1813 GFP_KERNEL)) {
1805 pagevec_add(&lru_pvec, page);
1806 pagevec_lru_add_file(&lru_pvec);
1807 SetPageUptodate(page); 1814 SetPageUptodate(page);
1808 unlock_page(page); 1815 unlock_page(page);
1809 } else 1816 } else
@@ -1870,7 +1877,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1870 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1877 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1871 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1878 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1872 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1879 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1873 new_dentry->d_count); 1880 d_count(new_dentry));
1874 1881
1875 /* 1882 /*
1876 * For non-directories, check whether the target is busy and if so, 1883 * For non-directories, check whether the target is busy and if so,
@@ -1888,7 +1895,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1888 rehash = new_dentry; 1895 rehash = new_dentry;
1889 } 1896 }
1890 1897
1891 if (new_dentry->d_count > 2) { 1898 if (d_count(new_dentry) > 2) {
1892 int err; 1899 int err;
1893 1900
1894 /* copy the target dentry's name */ 1901 /* copy the target dentry's name */
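
Throughout the dir.c hunks above, a struct nfs4_label is allocated before each LOOKUP-class call and released on every exit path (out_label_free, out_bad, out_error, out_unblock_sillyrename). A condensed sketch of the lifecycle those paths rely on; nfs4_label_alloc() itself is added in the inode.c hunk further down:

struct nfs4_label *label;
int error;

/*
 * nfs4_label_alloc() returns NULL when the server cannot do labels
 * (minor version < 2 or no NFS_CAP_SECURITY_LABEL), ERR_PTR(-ENOMEM)
 * on allocation failure, and otherwise an NFS4_MAXLABELLEN buffer.
 */
label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
if (IS_ERR(label))
        return PTR_ERR(label);

error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
/* ... consume fattr and label ... */
nfs4_label_free(label);         /* must also accept the NULL "unsupported" case */

Because NULL doubles as "labels unsupported", callers never branch on it; nfs4_label_free() has to be a no-op for NULL, just as the early error paths above assume.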
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
29 kfree(ip_addr); 29 kfree(ip_addr);
30 return ret; 30 return ret;
31} 31}
32EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
33 32
34#else 33#else
35 34
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
351 ret = -ESRCH; 350 ret = -ESRCH;
352 return ret; 351 return ret;
353} 352}
354EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
355 353
356static struct cache_detail nfs_dns_resolve_template = { 354static struct cache_detail nfs_dns_resolve_template = {
357 .owner = THIS_MODULE, 355 .owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
396 cache_destroy_net(nn->nfs_dns_resolve, net); 394 cache_destroy_net(nn->nfs_dns_resolve, net);
397} 395}
398 396
397static int nfs4_dns_net_init(struct net *net)
398{
399 return nfs_dns_resolver_cache_init(net);
400}
401
402static void nfs4_dns_net_exit(struct net *net)
403{
404 nfs_dns_resolver_cache_destroy(net);
405}
406
407static struct pernet_operations nfs4_dns_resolver_ops = {
408 .init = nfs4_dns_net_init,
409 .exit = nfs4_dns_net_exit,
410};
411
399static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, 412static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
400 void *ptr) 413 void *ptr)
401{ 414{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
432 445
433int nfs_dns_resolver_init(void) 446int nfs_dns_resolver_init(void)
434{ 447{
435 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); 448 int err;
449
450 err = register_pernet_subsys(&nfs4_dns_resolver_ops);
451 if (err < 0)
452 goto out;
453 err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
454 if (err < 0)
455 goto out1;
456 return 0;
457out1:
458 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
459out:
460 return err;
436} 461}
437 462
438void nfs_dns_resolver_destroy(void) 463void nfs_dns_resolver_destroy(void)
439{ 464{
440 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); 465 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
466 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
441} 467}
442#endif 468#endif
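
The resolver cache moves out of the generic NFS pernet hooks (see the inode.c hunk below) and into its own pernet_operations, registered once from nfs_dns_resolver_init(). The idiom, reduced to a neutral sketch with placeholder demo_* names:

#include <linux/module.h>
#include <net/net_namespace.h>

static int demo_net_init(struct net *net)
{
        /* allocate this namespace's private state */
        return 0;
}

static void demo_net_exit(struct net *net)
{
        /* tear the per-namespace state back down */
}

static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .exit = demo_net_exit,
};

register_pernet_subsys() runs .init for every namespace that exists now or is created later; unregister_pernet_subsys() runs .exit for each and removes the hooks, which is exactly the pairing nfs_dns_resolver_init()/_destroy() implement above.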
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454static void nfs_invalidate_page(struct page *page, unsigned long offset) 454static void nfs_invalidate_page(struct page *page, unsigned int offset,
455 unsigned int length)
455{ 456{
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 457 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
458 page, offset, length);
457 459
458 if (offset != 0) 460 if (offset != 0 || length < PAGE_CACHE_SIZE)
459 return; 461 return;
460 /* Cancel any unstarted writes on this page */ 462 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 463 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
493 return nfs_fscache_release_page(page, gfp); 495 return nfs_fscache_release_page(page, gfp);
494} 496}
495 497
498static void nfs_check_dirty_writeback(struct page *page,
499 bool *dirty, bool *writeback)
500{
501 struct nfs_inode *nfsi;
502 struct address_space *mapping = page_file_mapping(page);
503
504 if (!mapping || PageSwapCache(page))
505 return;
506
507 /*
508 * Check if an unstable page is currently being committed and
509 * if so, have the VM treat it as if the page is under writeback
510 * so it will not block due to pages that will shortly be freeable.
511 */
512 nfsi = NFS_I(mapping->host);
513 if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
514 *writeback = true;
515 return;
516 }
517
518 /*
519 * If PagePrivate() is set, then the page is not freeable and as the
520 * inode is not being committed, it's not going to be cleaned in the
521 * near future so treat it as dirty
522 */
523 if (PagePrivate(page))
524 *dirty = true;
525}
526
496/* 527/*
497 * Attempt to clear the private state associated with a page when an error 528 * Attempt to clear the private state associated with a page when an error
498 * occurs that requires the cached contents of an inode to be written back or 529 * occurs that requires the cached contents of an inode to be written back or
@@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
540 .direct_IO = nfs_direct_IO, 571 .direct_IO = nfs_direct_IO,
541 .migratepage = nfs_migrate_page, 572 .migratepage = nfs_migrate_page,
542 .launder_page = nfs_launder_page, 573 .launder_page = nfs_launder_page,
574 .is_dirty_writeback = nfs_check_dirty_writeback,
543 .error_remove_page = generic_error_remove_page, 575 .error_remove_page = generic_error_remove_page,
544#ifdef CONFIG_NFS_SWAP 576#ifdef CONFIG_NFS_SWAP
545 .swap_activate = nfs_swap_activate, 577 .swap_activate = nfs_swap_activate,
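
Two address_space_operations changes land in this file: ->invalidatepage() gains an (offset, length) pair so partial-page invalidation is distinguishable from whole-page teardown, and the new ->is_dirty_writeback() hook lets reclaim refine the generic PageDirty()/PageWriteback() answer with fs-private state. A sketch of the latter, with demofs_* as a placeholder:

static void demofs_is_dirty_writeback(struct page *page,
                                      bool *dirty, bool *writeback)
{
        /*
         * Leaving the flags untouched accepts the generic answer; set
         * them when private state makes the page effectively dirty or
         * effectively under writeback, as NFS does above for pages
         * whose unstable writes are mid-commit.
         */
        if (PagePrivate(page))
                *dirty = true;
}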
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
95 goto out; 95 goto out;
96 } 96 }
97 97
98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 dprintk("nfs_get_root: get root inode failed\n"); 100 dprintk("nfs_get_root: get root inode failed\n");
101 ret = ERR_CAST(inode); 101 ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
262 return desclen; 262 return desclen;
263} 263}
264 264
265static ssize_t nfs_idmap_request_key(struct key_type *key_type, 265static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
266 const char *name, size_t namelen, 266 const char *type, struct idmap *idmap)
267 const char *type, void *data,
268 size_t data_size, struct idmap *idmap)
269{ 267{
270 const struct cred *saved_cred;
271 struct key *rkey;
272 char *desc; 268 char *desc;
273 struct user_key_payload *payload; 269 struct key *rkey;
274 ssize_t ret; 270 ssize_t ret;
275 271
276 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); 272 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
277 if (ret <= 0) 273 if (ret <= 0)
278 goto out; 274 return ERR_PTR(ret);
275
276 rkey = request_key(&key_type_id_resolver, desc, "");
277 if (IS_ERR(rkey)) {
278 mutex_lock(&idmap->idmap_mutex);
279 rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
280 desc, "", 0, idmap);
281 mutex_unlock(&idmap->idmap_mutex);
282 }
283
284 kfree(desc);
285 return rkey;
286}
287
288static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
289 const char *type, void *data,
290 size_t data_size, struct idmap *idmap)
291{
292 const struct cred *saved_cred;
293 struct key *rkey;
294 struct user_key_payload *payload;
295 ssize_t ret;
279 296
280 saved_cred = override_creds(id_resolver_cache); 297 saved_cred = override_creds(id_resolver_cache);
281 if (idmap) 298 rkey = nfs_idmap_request_key(name, namelen, type, idmap);
282 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
283 else
284 rkey = request_key(&key_type_id_resolver, desc, "");
285 revert_creds(saved_cred); 299 revert_creds(saved_cred);
286 300
287 kfree(desc);
288 if (IS_ERR(rkey)) { 301 if (IS_ERR(rkey)) {
289 ret = PTR_ERR(rkey); 302 ret = PTR_ERR(rkey);
290 goto out; 303 goto out;
@@ -316,23 +329,6 @@ out:
316 return ret; 329 return ret;
317} 330}
318 331
319static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
320 const char *type, void *data,
321 size_t data_size, struct idmap *idmap)
322{
323 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
324 name, namelen, type, data,
325 data_size, NULL);
326 if (ret < 0) {
327 mutex_lock(&idmap->idmap_mutex);
328 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
329 name, namelen, type, data,
330 data_size, idmap);
331 mutex_unlock(&idmap->idmap_mutex);
332 }
333 return ret;
334}
335
336/* ID -> Name */ 332/* ID -> Name */
337static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, 333static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
338 size_t buflen, struct idmap *idmap) 334 size_t buflen, struct idmap *idmap)
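
Condensed from the interleaved hunk above, the refactored lookup path reads:

rkey = request_key(&key_type_id_resolver, desc, "");
if (IS_ERR(rkey)) {
        /* legacy idmapper fallback, serialized by idmap_mutex */
        mutex_lock(&idmap->idmap_mutex);
        rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
                                        desc, "", 0, idmap);
        mutex_unlock(&idmap->idmap_mutex);
}
kfree(desc);

Folding both attempts into one helper means the key description is built and freed once rather than once per attempt, and nfs_idmap_get_key() is left with only the payload-copying half of the job.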
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c1c7a9d78722..941246f2b43d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "dns_resolve.h"
52#include "pnfs.h" 51#include "pnfs.h"
53#include "nfs.h" 52#include "nfs.h"
54#include "netns.h" 53#include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
79{ 78{
80 if (fatal_signal_pending(current)) 79 if (fatal_signal_pending(current))
81 return -ERESTARTSYS; 80 return -ERESTARTSYS;
82 freezable_schedule(); 81 freezable_schedule_unsafe();
83 return 0; 82 return 0;
84} 83}
85EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); 84EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
162 161
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 162 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 163 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
166 nfs_fscache_invalidate(inode); 164 nfs_fscache_invalidate(inode);
167 } else { 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 166 | NFS_INO_INVALID_LABEL
169 } 167 | NFS_INO_INVALID_DATA
168 | NFS_INO_INVALID_ACCESS
169 | NFS_INO_INVALID_ACL
170 | NFS_INO_REVAL_PAGECACHE;
171 } else
172 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
173 | NFS_INO_INVALID_LABEL
174 | NFS_INO_INVALID_ACCESS
175 | NFS_INO_INVALID_ACL
176 | NFS_INO_REVAL_PAGECACHE;
170} 177}
171 178
172void nfs_zap_caches(struct inode *inode) 179void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
257 return 0; 264 return 0;
258} 265}
259 266
267#ifdef CONFIG_NFS_V4_SECURITY_LABEL
268void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
269 struct nfs4_label *label)
270{
271 int error;
272
273 if (label == NULL)
274 return;
275
276 if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
277 return;
278
279 if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
280 return;
281
282 if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
283 error = security_inode_notifysecctx(inode, label->label,
284 label->len);
285 if (error)
286 printk(KERN_ERR "%s() %s %d "
287 "security_inode_notifysecctx() %d\n",
288 __func__,
289 (char *)label->label,
290 label->len, error);
291 }
292}
293
294struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
295{
296 struct nfs4_label *label = NULL;
297 int minor_version = server->nfs_client->cl_minorversion;
298
299 if (minor_version < 2)
300 return label;
301
302 if (!(server->caps & NFS_CAP_SECURITY_LABEL))
303 return label;
304
305 label = kzalloc(sizeof(struct nfs4_label), flags);
306 if (label == NULL)
307 return ERR_PTR(-ENOMEM);
308
309 label->label = kzalloc(NFS4_MAXLABELLEN, flags);
310 if (label->label == NULL) {
311 kfree(label);
312 return ERR_PTR(-ENOMEM);
313 }
314 label->len = NFS4_MAXLABELLEN;
315
316 return label;
317}
318EXPORT_SYMBOL_GPL(nfs4_label_alloc);
319#else
320void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
321 struct nfs4_label *label)
322{
323}
324#endif
325EXPORT_SYMBOL_GPL(nfs_setsecurity);
326
260/* 327/*
261 * This is our front-end to iget that looks up inodes by file handle 328 * This is our front-end to iget that looks up inodes by file handle
262 * instead of inode number. 329 * instead of inode number.
263 */ 330 */
264struct inode * 331struct inode *
265nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) 332nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
266{ 333{
267 struct nfs_find_desc desc = { 334 struct nfs_find_desc desc = {
268 .fh = fh, 335 .fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
384 */ 451 */
385 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 452 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
386 } 453 }
454
455 nfs_setsecurity(inode, fattr, label);
456
387 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 457 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
388 nfsi->attrtimeo_timestamp = now; 458 nfsi->attrtimeo_timestamp = now;
389 nfsi->access_cache = RB_ROOT; 459 nfsi->access_cache = RB_ROOT;
@@ -449,7 +519,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
449 NFS_PROTO(inode)->return_delegation(inode); 519 NFS_PROTO(inode)->return_delegation(inode);
450 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); 520 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
451 if (error == 0) 521 if (error == 0)
452 nfs_refresh_inode(inode, fattr); 522 error = nfs_refresh_inode(inode, fattr);
453 nfs_free_fattr(fattr); 523 nfs_free_fattr(fattr);
454out: 524out:
455 return error; 525 return error;
@@ -713,16 +783,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
713 * Ensure that mmap has a recent RPC credential for use when writing out 783 * Ensure that mmap has a recent RPC credential for use when writing out
714 * shared pages 784 * shared pages
715 */ 785 */
716void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 786void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
717{ 787{
718 struct inode *inode = file_inode(filp); 788 struct inode *inode = ctx->dentry->d_inode;
719 struct nfs_inode *nfsi = NFS_I(inode); 789 struct nfs_inode *nfsi = NFS_I(inode);
720 790
721 filp->private_data = get_nfs_open_context(ctx);
722 spin_lock(&inode->i_lock); 791 spin_lock(&inode->i_lock);
723 list_add(&ctx->list, &nfsi->open_files); 792 list_add(&ctx->list, &nfsi->open_files);
724 spin_unlock(&inode->i_lock); 793 spin_unlock(&inode->i_lock);
725} 794}
795EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
796
797void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
798{
799 filp->private_data = get_nfs_open_context(ctx);
800 if (list_empty(&ctx->list))
801 nfs_inode_attach_open_context(ctx);
802}
726EXPORT_SYMBOL_GPL(nfs_file_set_open_context); 803EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
727 804
728/* 805/*
@@ -748,10 +825,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
748 825
749static void nfs_file_clear_open_context(struct file *filp) 826static void nfs_file_clear_open_context(struct file *filp)
750{ 827{
751 struct inode *inode = file_inode(filp);
752 struct nfs_open_context *ctx = nfs_file_open_context(filp); 828 struct nfs_open_context *ctx = nfs_file_open_context(filp);
753 829
754 if (ctx) { 830 if (ctx) {
831 struct inode *inode = ctx->dentry->d_inode;
832
755 filp->private_data = NULL; 833 filp->private_data = NULL;
756 spin_lock(&inode->i_lock); 834 spin_lock(&inode->i_lock);
757 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 835 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +868,7 @@ int
790__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 868__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
791{ 869{
792 int status = -ESTALE; 870 int status = -ESTALE;
871 struct nfs4_label *label = NULL;
793 struct nfs_fattr *fattr = NULL; 872 struct nfs_fattr *fattr = NULL;
794 struct nfs_inode *nfsi = NFS_I(inode); 873 struct nfs_inode *nfsi = NFS_I(inode);
795 874
@@ -807,7 +886,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
807 goto out; 886 goto out;
808 887
809 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 888 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
810 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); 889
890 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
891 if (IS_ERR(label)) {
892 status = PTR_ERR(label);
893 goto out;
894 }
895
896 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
811 if (status != 0) { 897 if (status != 0) {
812 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 898 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
813 inode->i_sb->s_id, 899 inode->i_sb->s_id,
@@ -817,7 +903,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
817 if (!S_ISDIR(inode->i_mode)) 903 if (!S_ISDIR(inode->i_mode))
818 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 904 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
819 } 905 }
820 goto out; 906 goto err_out;
821 } 907 }
822 908
823 status = nfs_refresh_inode(inode, fattr); 909 status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +911,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
825 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 911 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
826 inode->i_sb->s_id, 912 inode->i_sb->s_id,
827 (long long)NFS_FILEID(inode), status); 913 (long long)NFS_FILEID(inode), status);
828 goto out; 914 goto err_out;
829 } 915 }
830 916
831 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 917 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +921,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
835 inode->i_sb->s_id, 921 inode->i_sb->s_id,
836 (long long)NFS_FILEID(inode)); 922 (long long)NFS_FILEID(inode));
837 923
838 out: 924err_out:
925 nfs4_label_free(label);
926out:
839 nfs_free_fattr(fattr); 927 nfs_free_fattr(fattr);
840 return status; 928 return status;
841} 929}
@@ -847,7 +935,7 @@ int nfs_attribute_timeout(struct inode *inode)
847 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 935 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
848} 936}
849 937
850static int nfs_attribute_cache_expired(struct inode *inode) 938int nfs_attribute_cache_expired(struct inode *inode)
851{ 939{
852 if (nfs_have_delegated_attributes(inode)) 940 if (nfs_have_delegated_attributes(inode))
853 return 0; 941 return 0;
@@ -863,7 +951,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
863 */ 951 */
864int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 952int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
865{ 953{
866 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 954 if (!(NFS_I(inode)->cache_validity &
955 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
867 && !nfs_attribute_cache_expired(inode)) 956 && !nfs_attribute_cache_expired(inode))
868 return NFS_STALE(inode) ? -ESTALE : 0; 957 return NFS_STALE(inode) ? -ESTALE : 0;
869 return __nfs_revalidate_inode(server, inode); 958 return __nfs_revalidate_inode(server, inode);
@@ -873,9 +962,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
873static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) 962static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
874{ 963{
875 struct nfs_inode *nfsi = NFS_I(inode); 964 struct nfs_inode *nfsi = NFS_I(inode);
876 965 int ret;
966
877 if (mapping->nrpages != 0) { 967 if (mapping->nrpages != 0) {
878 int ret = invalidate_inode_pages2(mapping); 968 if (S_ISREG(inode->i_mode)) {
969 ret = nfs_sync_mapping(mapping);
970 if (ret < 0)
971 return ret;
972 }
973 ret = invalidate_inode_pages2(mapping);
879 if (ret < 0) 974 if (ret < 0)
880 return ret; 975 return ret;
881 } 976 }
@@ -1243,6 +1338,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1243 spin_lock(&inode->i_lock); 1338 spin_lock(&inode->i_lock);
1244 status = nfs_post_op_update_inode_locked(inode, fattr); 1339 status = nfs_post_op_update_inode_locked(inode, fattr);
1245 spin_unlock(&inode->i_lock); 1340 spin_unlock(&inode->i_lock);
1341
1246 return status; 1342 return status;
1247} 1343}
1248EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); 1344EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1579,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1483 inode->i_blocks = fattr->du.nfs2.blocks; 1579 inode->i_blocks = fattr->du.nfs2.blocks;
1484 1580
1485 /* Update attrtimeo value if we're out of the unstable period */ 1581 /* Update attrtimeo value if we're out of the unstable period */
1486 if (invalid & NFS_INO_INVALID_ATTR) { 1582 if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
1487 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1583 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1488 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1584 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1489 nfsi->attrtimeo_timestamp = now; 1585 nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1592,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 } 1592 }
1497 } 1593 }
1498 invalid &= ~NFS_INO_INVALID_ATTR; 1594 invalid &= ~NFS_INO_INVALID_ATTR;
1595 invalid &= ~NFS_INO_INVALID_LABEL;
1499 /* Don't invalidate the data if we were to blame */ 1596 /* Don't invalidate the data if we were to blame */
1500 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1597 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1501 || S_ISLNK(inode->i_mode))) 1598 || S_ISLNK(inode->i_mode)))
@@ -1638,12 +1735,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1638static int nfs_net_init(struct net *net) 1735static int nfs_net_init(struct net *net)
1639{ 1736{
1640 nfs_clients_init(net); 1737 nfs_clients_init(net);
1641 return nfs_dns_resolver_cache_init(net); 1738 return 0;
1642} 1739}
1643 1740
1644static void nfs_net_exit(struct net *net) 1741static void nfs_net_exit(struct net *net)
1645{ 1742{
1646 nfs_dns_resolver_cache_destroy(net);
1647 nfs_cleanup_cb_ident_idr(net); 1743 nfs_cleanup_cb_ident_idr(net);
1648} 1744}
1649 1745
@@ -1661,10 +1757,6 @@ static int __init init_nfs_fs(void)
1661{ 1757{
1662 int err; 1758 int err;
1663 1759
1664 err = nfs_dns_resolver_init();
1665 if (err < 0)
1666 goto out10;
1667
1668 err = register_pernet_subsys(&nfs_net_ops); 1760 err = register_pernet_subsys(&nfs_net_ops);
1669 if (err < 0) 1761 if (err < 0)
1670 goto out9; 1762 goto out9;
@@ -1730,8 +1822,6 @@ out7:
1730out8: 1822out8:
1731 unregister_pernet_subsys(&nfs_net_ops); 1823 unregister_pernet_subsys(&nfs_net_ops);
1732out9: 1824out9:
1733 nfs_dns_resolver_destroy();
1734out10:
1735 return err; 1825 return err;
1736} 1826}
1737 1827
@@ -1744,7 +1834,6 @@ static void __exit exit_nfs_fs(void)
1744 nfs_destroy_nfspagecache(); 1834 nfs_destroy_nfspagecache();
1745 nfs_fscache_unregister(); 1835 nfs_fscache_unregister();
1746 unregister_pernet_subsys(&nfs_net_ops); 1836 unregister_pernet_subsys(&nfs_net_ops);
1747 nfs_dns_resolver_destroy();
1748#ifdef CONFIG_PROC_FS 1837#ifdef CONFIG_PROC_FS
1749 rpc_proc_unregister(&init_net, "nfs"); 1838 rpc_proc_unregister(&init_net, "nfs");
1750#endif 1839#endif
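
Three quieter changes in this file are easy to miss in the interleave: nfs_zap_caches_locked() now raises NFS_INO_INVALID_LABEL alongside the other validity bits, nfs_setattr() propagates the nfs_refresh_inode() result instead of discarding it, and nfs_invalidate_mapping() flushes dirty data before purging the page cache. The last one, condensed:

if (mapping->nrpages != 0) {
        if (S_ISREG(inode->i_mode)) {
                ret = nfs_sync_mapping(mapping);        /* write back first */
                if (ret < 0)
                        return ret;
        }
        ret = invalidate_inode_pages2(mapping);         /* then invalidate */
        if (ret < 0)
                return ret;
}

Without the flush, invalidate_inode_pages2() can trip over still-dirty pages produced by a racing writer after revalidation decided the cache was stale.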
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
165extern struct nfs_client *nfs4_find_client_ident(struct net *, int); 165extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
166extern struct nfs_client * 166extern struct nfs_client *
167nfs4_find_client_sessionid(struct net *, const struct sockaddr *, 167nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
168 struct nfs4_sessionid *); 168 struct nfs4_sessionid *, u32);
169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, 169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
170 struct nfs_subversion *); 170 struct nfs_subversion *);
171extern struct nfs_server *nfs4_create_server( 171extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
255#ifdef CONFIG_NFS_V4_1 255#ifdef CONFIG_NFS_V4_1
256extern const u32 nfs41_maxread_overhead; 256extern const u32 nfs41_maxread_overhead;
257extern const u32 nfs41_maxwrite_overhead; 257extern const u32 nfs41_maxwrite_overhead;
258extern const u32 nfs41_maxgetdevinfo_overhead;
258#endif 259#endif
259 260
260/* nfs4proc.c */ 261/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
139 * nfs_mount - Obtain an NFS file handle for the given host and path 139 * nfs_mount - Obtain an NFS file handle for the given host and path
140 * @info: pointer to mount request arguments 140 * @info: pointer to mount request arguments
141 * 141 *
142 * Uses default timeout parameters specified by underlying transport. 142 * Uses default timeout parameters specified by underlying transport. On
143 * successful return, the auth_flavs list and auth_flav_len will be populated
144 * with the list from the server or a faked-up list if the server didn't
145 * provide one.
143 */ 146 */
144int nfs_mount(struct nfs_mount_request *info) 147int nfs_mount(struct nfs_mount_request *info)
145{ 148{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
195 dprintk("NFS: MNT request succeeded\n"); 198 dprintk("NFS: MNT request succeeded\n");
196 status = 0; 199 status = 0;
197 200
201 /*
202 * If the server didn't provide a flavor list, allow the
203 * client to try any flavor.
204 */
205 if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
206 dprintk("NFS: Faking up auth_flavs list\n");
207 info->auth_flavs[0] = RPC_AUTH_NULL;
208 *info->auth_flav_len = 1;
209 }
198out: 210out:
199 return status; 211 return status;
200 212
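
A hypothetical caller, to make the amended contract concrete: on success the flavor list now always holds at least one entry, with RPC_AUTH_NULL standing in for "server expressed no preference". The consider_flavor() helper is invented; NFS_MAX_SECFLAVORS is the client's usual list bound.

rpc_authflavor_t flavs[NFS_MAX_SECFLAVORS];
unsigned int flav_len = NFS_MAX_SECFLAVORS;
unsigned int i;
struct nfs_mount_request request = {
        /* transport and path fields elided */
        .auth_flavs     = flavs,
        .auth_flav_len  = &flav_len,
};

if (nfs_mount(&request) == 0)
        for (i = 0; i < flav_len; i++)
                consider_flavor(flavs[i]);      /* hypothetical helper */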
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
280 struct dentry *parent = dget_parent(dentry); 280 struct dentry *parent = dget_parent(dentry);
281 281
282 /* Look it up again to get its attributes */ 282 /* Look it up again to get its attributes */
283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); 283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
284 dput(parent); 284 dput(parent);
285 if (err != 0) 285 if (err != 0)
286 return ERR_PTR(err); 286 return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
38 } while (!fatal_signal_pending(current)); 38 } while (!fatal_signal_pending(current));
39 return res; 39 return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], 104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
143 143
144static int 144static int
145nfs3_proc_lookup(struct inode *dir, struct qstr *name, 145nfs3_proc_lookup(struct inode *dir, struct qstr *name,
146 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 146 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
147 struct nfs4_label *label)
147{ 148{
148 struct nfs3_diropargs arg = { 149 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 150 .fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
300 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 301 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
301 nfs_post_op_update_inode(dir, data->res.dir_attr); 302 nfs_post_op_update_inode(dir, data->res.dir_attr);
302 if (status == 0) 303 if (status == 0)
303 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 304 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
304 return status; 305 return status;
305} 306}
306 307
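
This file, like nfs4proc.c below, switches its retry sleeps to the *_unsafe freezer helpers, the variants provided for callers that may sleep while holding state the plain freezable helpers now object to. The jukebox retry loop itself, condensed from the hunk above:

do {
        res = rpc_call_sync(clnt, msg, flags);
        if (res != -EJUKEBOX)
                break;
        /* freezer-friendly, killable sleep between attempts */
        freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
        res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));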
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
194 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 194 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
197 int (*reclaim_complete)(struct nfs_client *); 197 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **, 198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
199 struct rpc_cred *); 199 struct rpc_cred *);
200}; 200};
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
304 304
305extern const u32 nfs4_fattr_bitmap[3]; 305extern const u32 nfs4_fattr_bitmap[3];
306extern const u32 nfs4_statfs_bitmap[2]; 306extern const u32 nfs4_statfs_bitmap[3];
307extern const u32 nfs4_pathconf_bitmap[2]; 307extern const u32 nfs4_pathconf_bitmap[3];
308extern const u32 nfs4_fsinfo_bitmap[3]; 308extern const u32 nfs4_fsinfo_bitmap[3];
309extern const u32 nfs4_fs_locations_bitmap[2]; 309extern const u32 nfs4_fs_locations_bitmap[3];
310 310
311void nfs4_free_client(struct nfs_client *); 311void nfs4_free_client(struct nfs_client *);
312 312
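
The bitmap arrays widen from [2] to [3] here because NFSv4.2 attributes, the security label among them, live in bitmap word 2; sizing every mask at three words lets the XDR encoder treat them uniformly. A sketch of the resulting shape, using constants that appear elsewhere in this diff:

static const u32 demo_bitmap[3] = {
        FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE,        /* word 0 */
        FATTR4_WORD1_TIME_MODIFY,                       /* word 1 */
        FATTR4_WORD2_SECURITY_LABEL,                    /* word 2 */
};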
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
66 if (err) 66 if (err)
67 goto error; 67 goto error;
68 68
69 if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
70 err = -EINVAL;
71 goto error;
72 }
73
69 spin_lock_init(&clp->cl_lock); 74 spin_lock_init(&clp->cl_lock);
70 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 75 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
71 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 76 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
562 */ 567 */
563struct nfs_client * 568struct nfs_client *
564nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 569nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
565 struct nfs4_sessionid *sid) 570 struct nfs4_sessionid *sid, u32 minorversion)
566{ 571{
567 struct nfs_client *clp; 572 struct nfs_client *clp;
568 struct nfs_net *nn = net_generic(net, nfs_net_id); 573 struct nfs_net *nn = net_generic(net, nfs_net_id);
569 574
570 spin_lock(&nn->nfs_client_lock); 575 spin_lock(&nn->nfs_client_lock);
571 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { 576 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
572 if (nfs4_cb_match_client(addr, clp, 1) == false) 577 if (nfs4_cb_match_client(addr, clp, minorversion) == false)
573 continue; 578 continue;
574 579
575 if (!nfs4_has_session(clp)) 580 if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
592 597
593struct nfs_client * 598struct nfs_client *
594nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 599nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
595 struct nfs4_sessionid *sid) 600 struct nfs4_sessionid *sid, u32 minorversion)
596{ 601{
597 return NULL; 602 return NULL;
598} 603}
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
626 631
627 if (server->flags & NFS_MOUNT_NORESVPORT) 632 if (server->flags & NFS_MOUNT_NORESVPORT)
628 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 633 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
634 if (server->options & NFS_OPTION_MIGRATION)
635 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
629 636
630 /* Allocate or find a client reference we can use */ 637 /* Allocate or find a client reference we can use */
631 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 638 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
730 return -ENOMEM; 737 return -ENOMEM;
731 738
732 /* We must ensure the session is initialised first */ 739 /* We must ensure the session is initialised first */
733 error = nfs4_init_session(server); 740 error = nfs4_init_session(server->nfs_client);
734 if (error < 0) 741 if (error < 0)
735 goto out; 742 goto out;
736 743
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
69 goto out_drop; 69 goto out_drop;
70 } 70 }
71 } 71 }
72 iput(inode);
73 if (inode != dentry->d_inode) 72 if (inode != dentry->d_inode)
74 goto out_drop; 73 goto out_drop;
75 74
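
The dropped iput() was an unbalanced put: nfs4_file_open() never took its own reference on that inode, so the call released a count the function did not own. The general rule, as a sketch:

struct inode *inode = igrab(dentry->d_inode);   /* take our own ref */
if (inode == NULL)
        return -ESTALE;
/* ... use inode ... */
iput(inode);    /* pairs with the igrab() above, and only with it */

dentry->d_inode itself is a borrowed reference and must not be handed to iput() directly.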
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
644 NFS_SERVER(lo->plh_inode)->nfs_client, id); 644 NFS_SERVER(lo->plh_inode)->nfs_client, id);
645 if (d == NULL) { 645 if (d == NULL) {
646 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); 646 dsaddr = filelayout_get_device_info(lo->plh_inode, id,
647 lo->plh_lc_cred, gfp_flags);
647 if (dsaddr == NULL) 648 if (dsaddr == NULL)
648 goto out; 649 goto out;
649 } else 650 } else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr * 152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
154 155
155#endif /* FS_NFS_NFS4FILELAYOUT_H */ 156#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
668 * of available devices, and return it. 668 * of available devices, and return it.
669 */ 669 */
670struct nfs4_file_layout_dsaddr * 670struct nfs4_file_layout_dsaddr *
671filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 671filelayout_get_device_info(struct inode *inode,
672 struct nfs4_deviceid *dev_id,
673 struct rpc_cred *cred,
674 gfp_t gfp_flags)
672{ 675{
673 struct pnfs_device *pdev = NULL; 676 struct pnfs_device *pdev = NULL;
674 u32 max_resp_sz; 677 u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
708 pdev->pgbase = 0; 711 pdev->pgbase = 0;
709 pdev->pglen = max_resp_sz; 712 pdev->pglen = max_resp_sz;
710 pdev->mincount = 0; 713 pdev->mincount = 0;
714 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
711 715
712 rc = nfs4_proc_getdeviceinfo(server, pdev); 716 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
713 dprintk("%s getdevice info returns %d\n", __func__, rc); 717 dprintk("%s getdevice info returns %d\n", __func__, rc);
714 if (rc) 718 if (rc)
715 goto out_free; 719 goto out_free;
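
GETDEVICEINFO now advertises how large a reply the client can absorb. The budget set up above, condensed: the whole reply buffer, minus the fixed per-operation overhead exported from internal.h, is offered for the device address body.

pdev->pgbase   = 0;
pdev->pglen    = max_resp_sz;           /* reply pages available */
pdev->mincount = 0;
pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;

rc = nfs4_proc_getdeviceinfo(server, pdev, cred);       /* now takes a cred */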
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d7ba5616989c..108a774095f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); 80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
83 struct nfs_fattr *fattr, struct iattr *sattr, 83 struct nfs_fattr *fattr, struct iattr *sattr,
84 struct nfs4_state *state); 84 struct nfs4_state *state, struct nfs4_label *ilabel,
85 struct nfs4_label *olabel);
85#ifdef CONFIG_NFS_V4_1 86#ifdef CONFIG_NFS_V4_1
86static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); 87static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
87static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); 88 struct rpc_cred *);
89static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
90 struct rpc_cred *);
88#endif 91#endif
92
93#ifdef CONFIG_NFS_V4_SECURITY_LABEL
94static inline struct nfs4_label *
95nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
96 struct iattr *sattr, struct nfs4_label *label)
97{
98 int err;
99
100 if (label == NULL)
101 return NULL;
102
103 if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
104 return NULL;
105
106 if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
107 return NULL;
108
109 err = security_dentry_init_security(dentry, sattr->ia_mode,
110 &dentry->d_name, (void **)&label->label, &label->len);
111 if (err == 0)
112 return label;
113
114 return NULL;
115}
116static inline void
117nfs4_label_release_security(struct nfs4_label *label)
118{
119 if (label)
120 security_release_secctx(label->label, label->len);
121}
122static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
123{
124 if (label)
125 return server->attr_bitmask;
126
127 return server->attr_bitmask_nl;
128}
129#else
130static inline struct nfs4_label *
131nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
132 struct iattr *sattr, struct nfs4_label *l)
133{ return NULL; }
134static inline void
135nfs4_label_release_security(struct nfs4_label *label)
136{ return; }
137static inline u32 *
138nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
139{ return server->attr_bitmask; }
140#endif
141
89/* Prevent leaks of NFSv4 errors into userland */ 142/* Prevent leaks of NFSv4 errors into userland */
90static int nfs4_map_errors(int err) 143static int nfs4_map_errors(int err)
91{ 144{
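
The label helpers above follow the standard kernel pattern for optional features: real functions under the config symbol, empty static inlines otherwise, so every call site stays unconditional and costs nothing in disabled builds. The idiom in isolation, with a made-up config symbol:

#ifdef CONFIG_DEMO_FEATURE
void demo_feature_apply(struct inode *inode);
#else
static inline void demo_feature_apply(struct inode *inode)
{
}
#endif

Callers simply invoke demo_feature_apply(); when CONFIG_DEMO_FEATURE is off, the compiler folds the call away.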
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
-	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
 	| FATTR4_WORD0_FILEID,
 };
 
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
 	FATTR4_WORD0_FILES_AVAIL
 	| FATTR4_WORD0_FILES_FREE
 	| FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
 	| FATTR4_WORD1_SPACE_TOTAL
 };
 
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
 	FATTR4_WORD0_MAXLINK
 	| FATTR4_WORD0_MAXNAME,
 	0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
 	FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
 	FATTR4_WORD0_TYPE
 	| FATTR4_WORD0_CHANGE
 	| FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY
-	| FATTR4_WORD1_MOUNTED_ON_FILEID
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
 };
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -268,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable(*timeout);
+	freezable_schedule_timeout_killable_unsafe(*timeout);
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
 	*timeout <<= 1;
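nfs4_delay() above implements a capped exponential backoff between NFS4ERR_DELAY retries: clamp the timeout into [MIN, MAX], sleep, then double it for the next round. A self-contained sketch of that schedule; the constants are placeholders, not the real NFS4_POLL_RETRY_* values.

#include <stdio.h>

#define POLL_RETRY_MIN 1	/* illustrative stand-ins */
#define POLL_RETRY_MAX 16

/* Clamp, "sleep", then double, exactly as nfs4_delay() does. */
static long next_delay(long *timeout)
{
	if (*timeout < POLL_RETRY_MIN)
		*timeout = POLL_RETRY_MIN;
	if (*timeout > POLL_RETRY_MAX)
		*timeout = POLL_RETRY_MAX;
	/* a real implementation would sleep for *timeout here */
	*timeout <<= 1;
	return *timeout;
}

int main(void)
{
	long t = 0;
	for (int i = 0; i < 6; i++)
		printf("slept, next timeout = %ld\n", next_delay(&t));
	/* prints 2, 4, 8, 16, 32, 32: doubling, then pinned at the cap */
	return 0;
}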
@@ -762,6 +818,7 @@ struct nfs4_opendata {
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
+	struct nfs4_label *f_label;
 	struct dentry *dir;
 	struct dentry *dentry;
 	struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
+	p->o_res.f_label = p->f_label;
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs,
+		struct nfs4_label *label,
 		enum open_claim_type4 claim,
 		gfp_t gfp_mask)
 {
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		goto err;
+
+	p->f_label = nfs4_label_alloc(server, gfp_mask);
+	if (IS_ERR(p->f_label))
+		goto err_free_p;
+
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
 	if (p->o_arg.seqid == NULL)
-		goto err_free;
+		goto err_free_label;
 	nfs_sb_active(dentry->d_sb);
 	p->dentry = dget(dentry);
 	p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
-	p->o_arg.bitmask = server->attr_bitmask;
+	p->o_arg.bitmask = nfs4_bitmask(server, label);
 	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+	p->o_arg.label = label;
 	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
 	switch (p->o_arg.claim) {
 	case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	nfs4_init_opendata_res(p);
 	kref_init(&p->kref);
 	return p;
-err_free:
+
+err_free_label:
+	nfs4_label_free(p->f_label);
+err_free_p:
 	kfree(p);
 err:
 	dput(parent);
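The relabelled error path above follows the kernel's staged-unwind convention: each new allocation gains a matching cleanup label, and a failure jumps to the label that frees everything allocated so far, in reverse order. A compact userspace illustration of the same structure (the struct and names are invented for the example):

#include <stdlib.h>

struct opendata { char *label; int *seqid; };

/* Two-stage allocation with reverse-order unwind: a failure at stage N
 * jumps to the label that releases stages N-1 .. 1, then the object. */
static struct opendata *opendata_alloc(void)
{
	struct opendata *p = calloc(1, sizeof(*p));
	if (p == NULL)
		goto err;

	p->label = malloc(32);
	if (p->label == NULL)
		goto err_free_p;

	p->seqid = malloc(sizeof(*p->seqid));
	if (p->seqid == NULL)
		goto err_free_label;
	return p;

err_free_label:
	free(p->label);
err_free_p:
	free(p);
err:
	return NULL;
}

int main(void)
{
	struct opendata *p = opendata_alloc();
	if (p) {
		free(p->seqid);
		free(p->label);
		free(p);
	}
	return 0;
}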
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
 	if (p->state != NULL)
 		nfs4_put_open_state(p->state);
 	nfs4_put_state_owner(p->owner);
+
+	nfs4_label_free(p->f_label);
+
 	dput(p->dir);
 	dput(p->dentry);
 	nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	if (ret)
 		goto err;
 
+	nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 	ret = -EAGAIN;
 	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
 		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 	struct nfs4_opendata *opendata;
 
 	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
-			NULL, claim, GFP_NOFS);
+			NULL, NULL, claim, GFP_NOFS);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		return status;
 	}
 	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
-		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
 	return 0;
 }
 
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->stateid;
-	int status;
+	struct nfs_delegation *delegation;
+	struct rpc_cred *cred = NULL;
+	int status = -NFS4ERR_BAD_STATEID;
 
 	/* If a state reset has been done, test_stateid is unneeded */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		return;
 
-	status = nfs41_test_stateid(server, stateid);
+	/* Get the delegation credential for use by test/free_stateid */
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation != NULL &&
+	    nfs4_stateid_match(&delegation->stateid, stateid)) {
+		cred = get_rpccred(delegation->cred);
+		rcu_read_unlock();
+		status = nfs41_test_stateid(server, stateid, cred);
+	} else
+		rcu_read_unlock();
+
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 		nfs_remove_bad_delegation(state->inode);
 
 		write_seqlock(&state->seqlock);
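The new code above pins the delegation credential (get_rpccred) while the RCU read lock is held, drops the lock, uses the credential for the TEST_STATEID/FREE_STATEID calls, and releases it at the end. Below is a simplified userspace model of that take-a-reference-under-the-lock pattern, using a plain mutex in place of RCU; everything here is illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int cred_refcount = 1;	/* stands in for the delegation cred */

/* Pin the credential while the lock is held, then drop the lock;
 * the caller may use the pinned object without holding the lock. */
static int *cred_get(void)
{
	int *cred = NULL;

	pthread_mutex_lock(&lock);
	if (cred_refcount > 0) {	/* "delegation still valid" check */
		cred_refcount++;	/* models get_rpccred() */
		cred = &cred_refcount;
	}
	pthread_mutex_unlock(&lock);
	return cred;
}

static void cred_put(int *cred)
{
	pthread_mutex_lock(&lock);
	(*cred)--;			/* models put_rpccred() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int *cred = cred_get();
	if (cred) {
		printf("using cred, refcount=%d\n", *cred);	/* "RPC" here */
		cred_put(cred);
	}
	return 0;
}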
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 		write_sequnlock(&state->seqlock);
 		clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
+
+	if (cred != NULL)
+		put_rpccred(cred);
 }
 
 /**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->open_stateid;
+	struct rpc_cred *cred = state->owner->so_cred;
 	int status;
 
 	/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
 		return -NFS4ERR_BAD_STATEID;
 
-	status = nfs41_test_stateid(server, stateid);
+	status = nfs41_test_stateid(server, stateid, cred);
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 			fmode_t fmode,
 			int flags,
-			struct nfs4_state **res)
+			struct nfs_open_context *ctx)
 {
 	struct nfs4_state_owner *sp = opendata->owner;
 	struct nfs_server *server = sp->so_server;
+	struct dentry *dentry;
 	struct nfs4_state *state;
 	unsigned int seq;
 	int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
+	dentry = opendata->dentry;
+	if (dentry->d_inode == NULL) {
+		/* FIXME: Is this d_drop() ever needed? */
+		d_drop(dentry);
+		dentry = d_add_unique(dentry, igrab(state->inode));
+		if (dentry == NULL) {
+			dentry = opendata->dentry;
+		} else if (dentry != ctx->dentry) {
+			dput(ctx->dentry);
+			ctx->dentry = dget(dentry);
+		}
+		nfs_set_verifier(dentry,
+				nfs_save_change_attribute(opendata->dir->d_inode));
+	}
+
 	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
 	if (ret != 0)
 		goto out;
 
-	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
-		nfs4_schedule_stateid_recovery(server, state);
-	*res = state;
+	ctx->state = state;
+	if (dentry->d_inode == state->inode) {
+		nfs_inode_attach_open_context(ctx);
+		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			nfs4_schedule_stateid_recovery(server, state);
+	}
 out:
 	return ret;
 }
@@ -1978,19 +2086,21 @@ out:
  * Returns a referenced nfs4_state
  */
 static int _nfs4_do_open(struct inode *dir,
-			struct dentry *dentry,
-			fmode_t fmode,
+			struct nfs_open_context *ctx,
 			int flags,
 			struct iattr *sattr,
-			struct rpc_cred *cred,
-			struct nfs4_state **res,
-			struct nfs4_threshold **ctx_th)
+			struct nfs4_label *label)
 {
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state = NULL;
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_opendata *opendata;
+	struct dentry *dentry = ctx->dentry;
+	struct rpc_cred *cred = ctx->cred;
+	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
 	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+	struct nfs4_label *olabel = NULL;
 	int status;
 
 	/* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
 	if (dentry->d_inode)
 		claim = NFS4_OPEN_CLAIM_FH;
 	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
-			claim, GFP_KERNEL);
+			label, claim, GFP_KERNEL);
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
+	if (label) {
+		olabel = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(olabel)) {
+			status = PTR_ERR(olabel);
+			goto err_opendata_put;
+		}
+	}
+
 	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
 		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
 		if (!opendata->f_attr.mdsthreshold)
-			goto err_opendata_put;
+			goto err_free_label;
 		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
 	}
 	if (dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
-	status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
 	if (status != 0)
-		goto err_opendata_put;
+		goto err_free_label;
+	state = ctx->state;
 
 	if ((opendata->o_arg.open_flags & O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
 				opendata->o_res.f_attr, sattr,
-				state);
-		if (status == 0)
+				state, label, olabel);
+		if (status == 0) {
 			nfs_setattr_update_inode(state->inode, sattr);
 			nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+		}
 	}
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
 	kfree(opendata->f_attr.mdsthreshold);
 	opendata->f_attr.mdsthreshold = NULL;
 
+	nfs4_label_free(olabel);
+
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	*res = state;
 	return 0;
+err_free_label:
+	nfs4_label_free(olabel);
 err_opendata_put:
 	kfree(opendata->f_attr.mdsthreshold);
 	nfs4_opendata_put(opendata);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
 out_err:
-	*res = NULL;
 	return status;
 }
 
 
 static struct nfs4_state *nfs4_do_open(struct inode *dir,
-		struct dentry *dentry,
-		fmode_t fmode,
+		struct nfs_open_context *ctx,
 		int flags,
 		struct iattr *sattr,
-		struct rpc_cred *cred,
-		struct nfs4_threshold **ctx_th)
+		struct nfs4_label *label)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
-	fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
 	do {
-		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
-				&res, ctx_th);
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+		res = ctx->state;
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 
 static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
-			   struct nfs4_state *state)
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 		.iap = sattr,
 		.server = server,
 		.bitmask = server->attr_bitmask,
+		.label = ilabel,
 	};
 	struct nfs_setattrres res = {
 		.fattr = fattr,
+		.label = olabel,
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	bool truncate;
 	int status;
 
+	arg.bitmask = nfs4_bitmask(server, ilabel);
+	if (ilabel)
+		arg.bitmask = nfs4_bitmask(server, olabel);
+
 	nfs_fattr_init(fattr);
 
 	/* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
-			   struct nfs4_state *state)
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	};
 	int err;
 	do {
-		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
 		switch (err) {
 		case -NFS4ERR_OPENMODE:
 			if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
 	struct nfs4_state *state;
+	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
-			ctx->cred, &ctx->mdsthreshold);
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+	nfs4_label_release_security(label);
+
 	if (IS_ERR(state))
 		return ERR_CAST(state);
-	ctx->state = state;
-	return igrab(state->inode);
+	return state->inode;
 }
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->caps |= NFS_CAP_CTIME;
 	if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
 		server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+		server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+	memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+			sizeof(server->attr_bitmask));
 
+	if (server->caps & NFS_CAP_SECURITY_LABEL) {
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+		res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+	}
 	memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 	server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 	server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fsinfo *info)
 {
+	u32 bitmask[3];
 	struct nfs4_lookup_root_arg args = {
-		.bitmask = nfs4_fattr_bitmap,
+		.bitmask = bitmask,
 	};
 	struct nfs4_lookup_res res = {
 		.server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 
+	bitmask[0] = nfs4_fattr_bitmap[0];
+	bitmask[1] = nfs4_fattr_bitmap[1];
+	/*
+	 * Process the label in the upcoming getfattr
+	 */
+	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
 	nfs_fattr_init(info->fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
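_nfs4_lookup_root() above builds a private three-word copy of the attribute bitmap and clears FATTR4_WORD2_SECURITY_LABEL in word 2, deferring label processing to a later GETATTR. The sketch below shows the same word-and-bit arithmetic on a local copy; the bit position and bitmap contents are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define WORD2_SECURITY_LABEL (1u << 16)	/* illustrative bit, not the real value */

int main(void)
{
	/* NFSv4 attribute bitmaps are arrays of 32-bit words; masking a
	 * capability means clearing one bit in the right word of a copy,
	 * leaving the shared template bitmap untouched. */
	uint32_t full[3]  = { 0x0001u, 0x0002u, WORD2_SECURITY_LABEL | 0x4u };
	uint32_t local[3];

	local[0] = full[0];
	local[1] = full[1];
	local[2] = full[2] & ~WORD2_SECURITY_LABEL;

	printf("word2: %#x -> %#x\n", full[2], local[2]);
	return 0;
}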
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 {
 	int error;
 	struct nfs_fattr *fattr = info->fattr;
+	struct nfs4_label *label = NULL;
 
 	error = nfs4_server_capabilities(server, mntfh);
 	if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 		return error;
 	}
 
-	error = nfs4_proc_getattr(server, mntfh, fattr);
+	label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	error = nfs4_proc_getattr(server, mntfh, fattr, label);
 	if (error < 0) {
 		dprintk("nfs4_get_root: getattr error = %d\n", -error);
-		return error;
+		goto err_free_label;
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
 	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
 		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
 
+err_free_label:
+	nfs4_label_free(label);
+
 	return error;
 }
 
@@ -2711,7 +2869,8 @@ out:
 	return status;
 }
 
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_getattr_arg args = {
 		.fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 	struct nfs4_getattr_res res = {
 		.fattr = fattr,
+		.label = label,
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
 
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
 
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs4_proc_getattr(server, fhandle, fattr),
+				_nfs4_proc_getattr(server, fhandle, fattr, label),
 				&exception);
 	} while (exception.retry);
 	return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct inode *inode = dentry->d_inode;
 	struct rpc_cred *cred = NULL;
 	struct nfs4_state *state = NULL;
+	struct nfs4_label *label = NULL;
 	int status;
 
 	if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		}
 	}
 
-	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
-	if (status == 0)
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+	if (status == 0) {
 		nfs_setattr_update_inode(inode, sattr);
+		nfs_setsecurity(inode, fattr, label);
+	}
+	nfs4_label_free(label);
 	return status;
 }
 
 static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 	struct nfs4_lookup_res res = {
 		.server = server,
 		.fattr = fattr,
+		.label = label,
 		.fh = fhandle,
 	};
 	struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		.rpc_resp = &res,
 	};
 
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 
 	dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
 
 static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 		struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	struct rpc_clnt *client = *clnt;
 	int err;
 	do {
-		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
 		switch (err) {
 		case -NFS4ERR_BADNAME:
 			err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
 }
 
 static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
-		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
 {
 	int status;
 	struct rpc_clnt *client = NFS_CLIENT(dir);
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
 	if (client != NFS_CLIENT(dir)) {
 		rpc_shutdown_client(client);
 		nfs_fixup_secinfo_attributes(fattr);
@@ -2896,15 +3071,13 @@ struct rpc_clnt *
 nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
 			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
+	struct rpc_clnt *client = NFS_CLIENT(dir);
 	int status;
-	struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
-	if (status < 0) {
-		rpc_shutdown_client(client);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
+	if (status < 0)
 		return ERR_PTR(status);
-	}
-	return client;
+	return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
 }
 
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
@@ -2924,7 +3097,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		.rpc_cred = entry->cred,
 	};
 	int mode = entry->mask;
-	int status;
+	int status = 0;
 
 	/*
 	 * Determine which access bits we want to ask for...
@@ -3029,6 +3202,7 @@ static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		 int flags)
 {
+	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
 	int status = 0;
@@ -3037,19 +3211,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, dentry, ctx->mode,
-			flags, sattr, ctx->cred,
-			&ctx->mdsthreshold);
-	d_drop(dentry);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
 	}
-	d_add(dentry, igrab(state->inode));
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	ctx->state = state;
 out:
+	nfs4_label_release_security(ilabel);
 	put_nfs_open_context(ctx);
 	return status;
 }
@@ -3098,6 +3269,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+	nfs_fattr_init(res->dir_attr);
 }
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3346,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.rpc_resp = &res,
 	};
 	int status = -ENOMEM;
-	
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3380,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	};
 	struct nfs4_link_res res = {
 		.server = server,
+		.label = NULL,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3393,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (res.fattr == NULL)
 		goto out;
 
+	res.label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(res.label)) {
+		status = PTR_ERR(res.label);
+		goto out;
+	}
+	arg.bitmask = nfs4_bitmask(server, res.label);
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(inode, res.fattr);
+		status = nfs_post_op_update_inode(inode, res.fattr);
+		if (!status)
+			nfs_setsecurity(inode, res.fattr, res.label);
 	}
+
+
+	nfs4_label_free(res.label);
+
 out:
 	nfs_free_fattr(res.fattr);
 	return status;
@@ -3247,6 +3434,7 @@ struct nfs4_createdata {
 	struct nfs4_create_res res;
 	struct nfs_fh fh;
 	struct nfs_fattr fattr;
+	struct nfs4_label *label;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3446,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 	if (data != NULL) {
 		struct nfs_server *server = NFS_SERVER(dir);
 
+		data->label = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(data->label))
+			goto out_free;
+
 		data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
 		data->msg.rpc_argp = &data->arg;
 		data->msg.rpc_resp = &data->res;
@@ -3266,13 +3458,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 		data->arg.name = name;
 		data->arg.attrs = sattr;
 		data->arg.ftype = ftype;
-		data->arg.bitmask = server->attr_bitmask;
+		data->arg.bitmask = nfs4_bitmask(server, data->label);
 		data->res.server = server;
 		data->res.fh = &data->fh;
 		data->res.fattr = &data->fattr;
+		data->res.label = data->label;
 		nfs_fattr_init(data->res.fattr);
 	}
 	return data;
+out_free:
+	kfree(data);
+	return NULL;
 }
 
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3477,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 			&data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
 	}
 	return status;
 }
 
 static void nfs4_free_createdata(struct nfs4_createdata *data)
 {
+	nfs4_label_free(data->label);
 	kfree(data);
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct page *page, unsigned int len, struct iattr *sattr)
+		struct page *page, unsigned int len, struct iattr *sattr,
+		struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENAMETOOLONG;
@@ -3308,6 +3506,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
 	data->arg.u.symlink.pages = &page;
 	data->arg.u.symlink.len = len;
+	data->arg.label = label;
 
 	status = nfs4_do_create(dir, dentry, data);
 
@@ -3320,18 +3519,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 		struct page *page, unsigned int len, struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
 				_nfs4_proc_symlink(dir, dentry, page,
-							len, sattr),
+							len, sattr, label),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
 	return err;
 }
 
 static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr)
+		struct iattr *sattr, struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENOMEM;
@@ -3340,6 +3545,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 	if (data == NULL)
 		goto out;
 
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 
 	nfs4_free_createdata(data);
@@ -3351,14 +3557,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mkdir(dir, dentry, sattr),
+				_nfs4_proc_mkdir(dir, dentry, sattr, label),
 				&exception);
 	} while (exception.retry);
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -3416,7 +3627,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr, dev_t rdev)
+		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
 {
 	struct nfs4_createdata *data;
 	int mode = sattr->ia_mode;
@@ -3441,7 +3652,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		status = -EINVAL;
 		goto out_free;
 	}
 
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 out_free:
 	nfs4_free_createdata(data);
@@ -3453,14 +3665,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr, dev_t rdev)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mknod(dir, dentry, sattr, rdev),
+				_nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -4187,6 +4405,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 	return err;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct nfs4_label label = {0, 0, buflen, buf};
+
+	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs4_getattr_arg args = {
+		.fh = NFS_FH(inode),
+		.bitmask = bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr = &fattr,
+		.label = &label,
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int ret;
+
+	nfs_fattr_init(&fattr);
+
+	ret = rpc_call_sync(server->client, &msg, 0);
+	if (ret)
+		return ret;
+	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+		return -ENOENT;
+	if (buflen < label.len)
+		return -ERANGE;
+	return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_get_security_label(inode, buf, buflen),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+
+	struct iattr sattr = {0};
+	struct nfs_server *server = NFS_SERVER(inode);
+	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs_setattrargs args = {
+		.fh = NFS_FH(inode),
+		.iap = &sattr,
+		.server = server,
+		.bitmask = bitmask,
+		.label = ilabel,
+	};
+	struct nfs_setattrres res = {
+		.fattr = fattr,
+		.label = olabel,
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+	status = rpc_call_sync(server->client, &msg, 0);
+	if (status)
+		dprintk("%s failed: %d\n", __func__, status);
+
+	return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_do_set_security_label(inode, ilabel,
+				fattr, olabel),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+	struct nfs4_label ilabel, *olabel = NULL;
+	struct nfs_fattr fattr;
+	struct rpc_cred *cred;
+	struct inode *inode = dentry->d_inode;
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	nfs_fattr_init(&fattr);
+
+	ilabel.pi = 0;
+	ilabel.lfs = 0;
+	ilabel.label = (char *)buf;
+	ilabel.len = buflen;
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+
+	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(olabel)) {
+		status = PTR_ERR(olabel);
+		goto out;
+	}
+
+	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+	if (status == 0)
+		nfs_setsecurity(inode, &fattr, olabel);
+
+	nfs4_label_free(olabel);
+out:
+	put_rpccred(cred);
+	return status;
+}
+#endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
@@ -4345,7 +4712,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	/* cb_client4 */
 	rcu_read_lock();
 	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-				sizeof(setclientid.sc_netid),
+				sizeof(setclientid.sc_netid), "%s",
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_NETID));
 	rcu_read_unlock();
@@ -4528,7 +4895,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	freezable_schedule_timeout_killable(timeout);
+	freezable_schedule_timeout_killable_unsafe(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
@@ -5056,13 +5423,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
 		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			status = nfs41_test_stateid(server, &lsp->ls_stateid);
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_stateid(server,
+					&lsp->ls_stateid,
+					cred);
 			if (status != NFS_OK) {
 				/* Free the stateid unless the server
 				 * informs us the stateid is unrecognized. */
 				if (status != -NFS4ERR_BAD_STATEID)
 					nfs41_free_stateid(server,
-							&lsp->ls_stateid);
+							&lsp->ls_stateid,
+							cred);
 				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
 				ret = status;
 			}
@@ -5295,6 +5667,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
 	return len;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+	return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+				   const void *buf, size_t buflen,
+				   int flags, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_set_security_label(dentry, buf, buflen);
+
+	return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+	return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+				       size_t list_len, const char *name,
+				       size_t name_len, int type)
+{
+	size_t len = 0;
+
+	if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+		len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+		if (list && len <= list_len)
+			security_inode_listsecurity(dentry->d_inode, list, len);
+	}
+	return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = nfs4_xattr_list_nfs4_label,
+	.get = nfs4_xattr_get_nfs4_label,
+	.set = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
 /*
  * nfs_fhget will use either the mounted_on_fileid or the fileid
 */
@@ -5318,7 +5737,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
5318 struct page *page) 5737 struct page *page)
5319{ 5738{
5320 struct nfs_server *server = NFS_SERVER(dir); 5739 struct nfs_server *server = NFS_SERVER(dir);
5321 u32 bitmask[2] = { 5740 u32 bitmask[3] = {
5322 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, 5741 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
5323 }; 5742 };
5324 struct nfs4_fs_locations_arg args = { 5743 struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5924,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5505 struct nfs41_exchange_id_args args = { 5924 struct nfs41_exchange_id_args args = {
5506 .verifier = &verifier, 5925 .verifier = &verifier,
5507 .client = clp, 5926 .client = clp,
5508 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, 5927 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
5928 EXCHGID4_FLAG_BIND_PRINC_STATEID,
5509 }; 5929 };
5510 struct nfs41_exchange_id_res res = { 5930 struct nfs41_exchange_id_res res = {
5511 0 5931 0
@@ -5762,17 +6182,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5762 */ 6182 */
5763static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 6183static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
5764{ 6184{
5765 struct nfs4_session *session = args->client->cl_session; 6185 unsigned int max_rqst_sz, max_resp_sz;
5766 unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, 6186
5767 mxresp_sz = session->fc_target_max_resp_sz; 6187 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
6188 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
5768 6189
5769 if (mxrqst_sz == 0)
5770 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
5771 if (mxresp_sz == 0)
5772 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
5773 /* Fore channel attributes */ 6190 /* Fore channel attributes */
5774 args->fc_attrs.max_rqst_sz = mxrqst_sz; 6191 args->fc_attrs.max_rqst_sz = max_rqst_sz;
5775 args->fc_attrs.max_resp_sz = mxresp_sz; 6192 args->fc_attrs.max_resp_sz = max_resp_sz;
5776 args->fc_attrs.max_ops = NFS4_MAX_OPS; 6193 args->fc_attrs.max_ops = NFS4_MAX_OPS;
5777 args->fc_attrs.max_reqs = max_session_slots; 6194 args->fc_attrs.max_reqs = max_session_slots;
5778 6195
@@ -6159,12 +6576,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
6159/* 6576/*
6160 * Issue a global reclaim complete. 6577 * Issue a global reclaim complete.
6161 */ 6578 */
6162static int nfs41_proc_reclaim_complete(struct nfs_client *clp) 6579static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
6580 struct rpc_cred *cred)
6163{ 6581{
6164 struct nfs4_reclaim_complete_data *calldata; 6582 struct nfs4_reclaim_complete_data *calldata;
6165 struct rpc_task *task; 6583 struct rpc_task *task;
6166 struct rpc_message msg = { 6584 struct rpc_message msg = {
6167 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], 6585 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
6586 .rpc_cred = cred,
6168 }; 6587 };
6169 struct rpc_task_setup task_setup_data = { 6588 struct rpc_task_setup task_setup_data = {
6170 .rpc_client = clp->cl_rpcclient, 6589 .rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6767,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6348 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], 6767 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
6349 .rpc_argp = &lgp->args, 6768 .rpc_argp = &lgp->args,
6350 .rpc_resp = &lgp->res, 6769 .rpc_resp = &lgp->res,
6770 .rpc_cred = lgp->cred,
6351 }; 6771 };
6352 struct rpc_task_setup task_setup_data = { 6772 struct rpc_task_setup task_setup_data = {
6353 .rpc_client = server->client, 6773 .rpc_client = server->client,
@@ -6451,6 +6871,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6451 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], 6871 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
6452 .rpc_argp = &lrp->args, 6872 .rpc_argp = &lrp->args,
6453 .rpc_resp = &lrp->res, 6873 .rpc_resp = &lrp->res,
6874 .rpc_cred = lrp->cred,
6454 }; 6875 };
6455 struct rpc_task_setup task_setup_data = { 6876 struct rpc_task_setup task_setup_data = {
6456 .rpc_client = lrp->clp->cl_rpcclient, 6877 .rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6941,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
6520EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); 6941EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
6521 6942
6522static int 6943static int
6523_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6944_nfs4_proc_getdeviceinfo(struct nfs_server *server,
6945 struct pnfs_device *pdev,
6946 struct rpc_cred *cred)
6524{ 6947{
6525 struct nfs4_getdeviceinfo_args args = { 6948 struct nfs4_getdeviceinfo_args args = {
6526 .pdev = pdev, 6949 .pdev = pdev,
@@ -6532,6 +6955,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6532 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], 6955 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
6533 .rpc_argp = &args, 6956 .rpc_argp = &args,
6534 .rpc_resp = &res, 6957 .rpc_resp = &res,
6958 .rpc_cred = cred,
6535 }; 6959 };
6536 int status; 6960 int status;
6537 6961
@@ -6542,14 +6966,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6542 return status; 6966 return status;
6543} 6967}
6544 6968
6545int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6969int nfs4_proc_getdeviceinfo(struct nfs_server *server,
6970 struct pnfs_device *pdev,
6971 struct rpc_cred *cred)
6546{ 6972{
6547 struct nfs4_exception exception = { }; 6973 struct nfs4_exception exception = { };
6548 int err; 6974 int err;
6549 6975
6550 do { 6976 do {
6551 err = nfs4_handle_exception(server, 6977 err = nfs4_handle_exception(server,
6552 _nfs4_proc_getdeviceinfo(server, pdev), 6978 _nfs4_proc_getdeviceinfo(server, pdev, cred),
6553 &exception); 6979 &exception);
6554 } while (exception.retry); 6980 } while (exception.retry);
6555 return err; 6981 return err;
@@ -6733,7 +7159,9 @@ out:
6733 return err; 7159 return err;
6734} 7160}
6735 7161
6736static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7162static int _nfs41_test_stateid(struct nfs_server *server,
7163 nfs4_stateid *stateid,
7164 struct rpc_cred *cred)
6737{ 7165{
6738 int status; 7166 int status;
6739 struct nfs41_test_stateid_args args = { 7167 struct nfs41_test_stateid_args args = {
@@ -6744,6 +7172,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], 7172 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6745 .rpc_argp = &args, 7173 .rpc_argp = &args,
6746 .rpc_resp = &res, 7174 .rpc_resp = &res,
7175 .rpc_cred = cred,
6747 }; 7176 };
6748 7177
6749 dprintk("NFS call test_stateid %p\n", stateid); 7178 dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7193,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6764 * 7193 *
6765 * @server: server / transport on which to perform the operation 7194 * @server: server / transport on which to perform the operation
6766 * @stateid: state ID to test 7195 * @stateid: state ID to test
7196 * @cred: credential
6767 * 7197 *
6768 * Returns NFS_OK if the server recognizes that "stateid" is valid. 7198 * Returns NFS_OK if the server recognizes that "stateid" is valid.
6769 * Otherwise a negative NFS4ERR value is returned if the operation 7199 * Otherwise a negative NFS4ERR value is returned if the operation
6770 * failed or the state ID is not currently valid. 7200 * failed or the state ID is not currently valid.
6771 */ 7201 */
6772static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7202static int nfs41_test_stateid(struct nfs_server *server,
7203 nfs4_stateid *stateid,
7204 struct rpc_cred *cred)
6773{ 7205{
6774 struct nfs4_exception exception = { }; 7206 struct nfs4_exception exception = { };
6775 int err; 7207 int err;
6776 do { 7208 do {
6777 err = _nfs41_test_stateid(server, stateid); 7209 err = _nfs41_test_stateid(server, stateid, cred);
6778 if (err != -NFS4ERR_DELAY) 7210 if (err != -NFS4ERR_DELAY)
6779 break; 7211 break;
6780 nfs4_handle_exception(server, err, &exception); 7212 nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7255,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
6823 7255
6824static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, 7256static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6825 nfs4_stateid *stateid, 7257 nfs4_stateid *stateid,
7258 struct rpc_cred *cred,
6826 bool privileged) 7259 bool privileged)
6827{ 7260{
6828 struct rpc_message msg = { 7261 struct rpc_message msg = {
6829 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], 7262 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
7263 .rpc_cred = cred,
6830 }; 7264 };
6831 struct rpc_task_setup task_setup = { 7265 struct rpc_task_setup task_setup = {
6832 .rpc_client = server->client, 7266 .rpc_client = server->client,
@@ -6859,16 +7293,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6859 * 7293 *
6860 * @server: server / transport on which to perform the operation 7294 * @server: server / transport on which to perform the operation
6861 * @stateid: state ID to release 7295 * @stateid: state ID to release
7296 * @cred: credential
6862 * 7297 *
6863 * Returns NFS_OK if the server freed "stateid". Otherwise a 7298 * Returns NFS_OK if the server freed "stateid". Otherwise a
6864 * negative NFS4ERR value is returned. 7299 * negative NFS4ERR value is returned.
6865 */ 7300 */
6866static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7301static int nfs41_free_stateid(struct nfs_server *server,
7302 nfs4_stateid *stateid,
7303 struct rpc_cred *cred)
6867{ 7304{
6868 struct rpc_task *task; 7305 struct rpc_task *task;
6869 int ret; 7306 int ret;
6870 7307
6871 task = _nfs41_free_stateid(server, stateid, true); 7308 task = _nfs41_free_stateid(server, stateid, cred, true);
6872 if (IS_ERR(task)) 7309 if (IS_ERR(task))
6873 return PTR_ERR(task); 7310 return PTR_ERR(task);
6874 ret = rpc_wait_for_completion_task(task); 7311 ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7318,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6881static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 7318static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
6882{ 7319{
6883 struct rpc_task *task; 7320 struct rpc_task *task;
7321 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
6884 7322
6885 task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); 7323 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
6886 nfs4_free_lock_state(server, lsp); 7324 nfs4_free_lock_state(server, lsp);
6887 if (IS_ERR(task)) 7325 if (IS_ERR(task))
6888 return PTR_ERR(task); 7326 return PTR_ERR(task);
@@ -7004,11 +7442,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7004}; 7442};
7005#endif 7443#endif
7006 7444
7445#if defined(CONFIG_NFS_V4_2)
7446static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
7447 .minor_version = 2,
7448 .init_caps = NFS_CAP_READDIRPLUS
7449 | NFS_CAP_ATOMIC_OPEN
7450 | NFS_CAP_CHANGE_ATTR
7451 | NFS_CAP_POSIX_LOCK
7452 | NFS_CAP_STATEID_NFSV41
7453 | NFS_CAP_ATOMIC_OPEN_V1,
7454 .call_sync = nfs4_call_sync_sequence,
7455 .match_stateid = nfs41_match_stateid,
7456 .find_root_sec = nfs41_find_root_sec,
7457 .free_lock_state = nfs41_free_lock_state,
7458 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7459 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7460 .state_renewal_ops = &nfs41_state_renewal_ops,
7461};
7462#endif
7463
7007const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { 7464const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
7008 [0] = &nfs_v4_0_minor_ops, 7465 [0] = &nfs_v4_0_minor_ops,
7009#if defined(CONFIG_NFS_V4_1) 7466#if defined(CONFIG_NFS_V4_1)
7010 [1] = &nfs_v4_1_minor_ops, 7467 [1] = &nfs_v4_1_minor_ops,
7011#endif 7468#endif
7469#if defined(CONFIG_NFS_V4_2)
7470 [2] = &nfs_v4_2_minor_ops,
7471#endif
7012}; 7472};
7013 7473
7014const struct inode_operations nfs4_dir_inode_operations = { 7474const struct inode_operations nfs4_dir_inode_operations = {
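
The new nfs_v4_2_minor_ops entry extends the pattern already used in this file: each minor version's behaviour lives in a const ops vector, and nfs_v4_minor_ops[], indexed by minor version and guarded by config options, selects it at mount time. A stand-alone sketch of that dispatch shape (the struct and callbacks are simplified stand-ins, not the kernel's full vector):

    #include <stdio.h>

    /* Hypothetical, much-reduced ops vector. */
    struct minor_version_ops {
        unsigned int minor_version;
        const char *(*describe)(void);
    };

    static const char *describe_v40(void) { return "NFSv4.0: no sessions"; }
    static const char *describe_v41(void) { return "NFSv4.1: sessions, pNFS"; }

    static const struct minor_version_ops v40_ops = { 0, describe_v40 };
    static const struct minor_version_ops v41_ops = { 1, describe_v41 };

    /* A NULL slot would mean the minor version was compiled out. */
    static const struct minor_version_ops *minor_ops[] = {
        [0] = &v40_ops,
        [1] = &v41_ops,
    };

    int main(void)
    {
        unsigned int i;

        for (i = 0; i < sizeof(minor_ops) / sizeof(minor_ops[0]); i++)
            if (minor_ops[i])
                printf("%u: %s\n", minor_ops[i]->minor_version,
                       minor_ops[i]->describe());
        return 0;
    }
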
@@ -7108,6 +7568,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
 
 const struct xattr_handler *nfs4_xattr_handlers[] = {
     &nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+    &nfs4_xattr_nfs4_label_handler,
+#endif
     NULL
 };
 
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
     return 0;
 }
 
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
 {
-    struct nfs_client *clp = server->nfs_client;
-    struct nfs4_session *session;
-    unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
-    unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
     if (!nfs4_has_session(clp))
         return 0;
 
-    if (server->rsize != 0)
-        target_max_resp_sz = server->rsize;
-    target_max_resp_sz += nfs41_maxread_overhead;
-
-    if (server->wsize != 0)
-        target_max_rqst_sz = server->wsize;
-    target_max_rqst_sz += nfs41_maxwrite_overhead;
-
-    session = clp->cl_session;
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-        /* Initialise targets and channel attributes */
-        session->fc_target_max_rqst_sz = target_max_rqst_sz;
-        session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
-        session->fc_target_max_resp_sz = target_max_resp_sz;
-        session->fc_attrs.max_resp_sz = target_max_resp_sz;
-    } else {
-        /* Just adjust the targets */
-        if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
-            session->fc_target_max_rqst_sz = target_max_rqst_sz;
-            set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-        }
-        if (target_max_resp_sz > session->fc_target_max_resp_sz) {
-            session->fc_target_max_resp_sz = target_max_resp_sz;
-            set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-        }
-    }
-    spin_unlock(&clp->cl_lock);
-
-    if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-        nfs4_schedule_lease_recovery(clp);
-
+    clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
     return nfs41_check_session_ready(clp);
 }
 
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -66,9 +66,6 @@ struct nfs4_session {
     struct nfs4_channel_attrs bc_attrs;
     struct nfs4_slot_table bc_slot_table;
     struct nfs_client *clp;
-    /* Create session arguments */
-    unsigned int fc_target_max_rqst_sz;
-    unsigned int fc_target_max_resp_sz;
 };
 
 enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
 
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
 extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 
 #else /* defined(CONFIG_NFS_V4_1) */
 
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
 {
     return 0;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1fab140764c4..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
     return status;
 }
 
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
 {
-    struct nfs4_session *ses = clp->cl_session;
-    struct nfs4_slot_table *tbl;
-
-    if (ses == NULL)
-        return;
-    tbl = &ses->fc_slot_table;
     if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
         spin_lock(&tbl->slot_tbl_lock);
         nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
     }
 }
 
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+    struct nfs4_session *ses = clp->cl_session;
+
+    if (ses != NULL) {
+        nfs4_end_drain_slot_table(&ses->bc_slot_table);
+        nfs4_end_drain_slot_table(&ses->fc_slot_table);
+    }
+}
+
 /*
  * Signal state manager thread if session fore channel is drained
  */
@@ -1194,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
     snprintf(buf, sizeof(buf), "%s-manager",
             rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
     rcu_read_unlock();
-    task = kthread_run(nfs4_run_state_manager, clp, buf);
+    task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
     if (IS_ERR(task)) {
         printk(KERN_ERR "%s: kthread_run: %ld\n",
             __func__, PTR_ERR(task));
@@ -1373,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
     /* Guard against delegation returns and new lock/unlock calls */
     down_write(&nfsi->rwsem);
     /* Protect inode->i_flock using the BKL */
-    lock_flocks();
+    spin_lock(&inode->i_lock);
     for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
         if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
             continue;
         if (nfs_file_open_context(fl->fl_file)->state != state)
             continue;
-        unlock_flocks();
+        spin_unlock(&inode->i_lock);
         status = ops->recover_lock(state, fl);
         switch (status) {
         case 0:
@@ -1406,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
             /* kill_proc(fl->fl_pid, SIGLOST, 1); */
             status = 0;
         }
-        lock_flocks();
+        spin_lock(&inode->i_lock);
     }
-    unlock_flocks();
+    spin_unlock(&inode->i_lock);
 out:
     up_write(&nfsi->rwsem);
     return status;
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
 }
 
 static void nfs4_reclaim_complete(struct nfs_client *clp,
-                 const struct nfs4_state_recovery_ops *ops)
+                 const struct nfs4_state_recovery_ops *ops,
+                 struct rpc_cred *cred)
 {
     /* Notify the server we're done reclaiming our state */
     if (ops->reclaim_complete)
-        (void)ops->reclaim_complete(clp);
+        (void)ops->reclaim_complete(clp, cred);
 }
 
 static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
+    const struct nfs4_state_recovery_ops *ops;
+    struct rpc_cred *cred;
+
     if (!nfs4_state_clear_reclaim_reboot(clp))
         return;
-    nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+    ops = clp->cl_mvops->reboot_recovery_ops;
+    cred = ops->get_clid_cred(clp);
+    nfs4_reclaim_complete(clp, ops, cred);
+    put_rpccred(cred);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
 {
     int err;
 
-    err = nfs_idmap_init();
+    err = nfs_dns_resolver_init();
     if (err)
         goto out;
 
-    err = nfs4_register_sysctl();
+    err = nfs_idmap_init();
     if (err)
         goto out1;
 
+    err = nfs4_register_sysctl();
+    if (err)
+        goto out2;
+
     register_nfs_version(&nfs_v4);
     return 0;
-out1:
+out2:
     nfs_idmap_quit();
+out1:
+    nfs_dns_resolver_destroy();
 out:
     return err;
 }
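
The reworked init_nfs_v4() keeps the usual kernel error-unwind discipline: each successfully initialised subsystem gets a label, and failure at step N jumps to the label that tears down steps N-1 back to 1 in reverse order. A compilable sketch of the same ladder (the *_init/*_destroy functions are stand-ins, with the last step forced to fail so the unwind runs):

    #include <stdio.h>

    /* Stand-in init/teardown pairs; each init returns 0 on success. */
    static int resolver_init(void)     { puts("resolver up"); return 0; }
    static void resolver_destroy(void) { puts("resolver down"); }
    static int idmap_init(void)        { puts("idmap up"); return 0; }
    static void idmap_quit(void)       { puts("idmap down"); }
    static int sysctl_register(void)   { puts("sysctl fails"); return -1; }

    static int init_example(void)
    {
        int err;

        err = resolver_init();
        if (err)
            goto out;
        err = idmap_init();
        if (err)
            goto out1;
        err = sysctl_register();
        if (err)
            goto out2;
        return 0;
    out2:
        idmap_quit();        /* undo step 2 */
    out1:
        resolver_destroy();  /* undo step 1 */
    out:
        return err;
    }

    int main(void)
    {
        return init_example() ? 1 : 0;
    }
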
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
     unregister_nfs_version(&nfs_v4);
     nfs4_unregister_sysctl();
     nfs_idmap_quit();
+    nfs_dns_resolver_destroy();
 }
 
 MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..3850b018815f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz    (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz   (1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz   (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz   (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define nfs4_label_maxsz   0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
                 3 + 3 + 3 + nfs4_owner_maxsz + \
-                nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+                nfs4_group_maxsz + nfs4_label_maxsz + \
+                decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz   (nfs4_fattr_bitmap_maxsz + \
                 nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
                 1 + 2 + 1 + \
                 nfs4_owner_maxsz + \
                 nfs4_group_maxsz + \
+                nfs4_label_maxsz + \
                 4 + 4)
 #define encode_savefh_maxsz (op_encode_hdr_maxsz)
 #define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
                 encode_stateid_maxsz + 3)
 #define decode_read_maxsz  (op_decode_hdr_maxsz + 2)
 #define encode_readdir_maxsz (op_encode_hdr_maxsz + \
-                2 + encode_verifier_maxsz + 5)
+                2 + encode_verifier_maxsz + 5 + \
+                nfs4_label_maxsz)
 #define decode_readdir_maxsz (op_decode_hdr_maxsz + \
-                decode_verifier_maxsz)
+                decode_verifier_maxsz + \
+                nfs4_label_maxsz + nfs4_fattr_maxsz)
 #define encode_readlink_maxsz (op_encode_hdr_maxsz)
 #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
 #define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                      decode_sequence_maxsz +
                      decode_putfh_maxsz) *
                     XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+                       compound_decode_hdr_maxsz +
+                       decode_sequence_maxsz) *
+                      XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
     encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
 }
 
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+                const struct nfs4_label *label,
+                const struct nfs_server *server)
 {
     char owner_name[IDMAP_NAMESZ];
     char owner_group[IDMAP_NAMESZ];
@@ -977,17 +999,19 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
     __be32 *p;
     __be32 *q;
     int len;
+    uint32_t bmval_len = 2;
     uint32_t bmval0 = 0;
     uint32_t bmval1 = 0;
+    uint32_t bmval2 = 0;
 
     /*
      * We reserve enough space to write the entire attribute buffer at once.
      * In the worst-case, this would be
-     *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
-     * = 36 bytes, plus any contribution from variable-length fields
+     *   16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+     * = 40 bytes, plus any contribution from variable-length fields
      * such as owner/group.
      */
-    len = 16;
+    len = 8;
 
     /* Sigh */
     if (iap->ia_valid & ATTR_SIZE)
@@ -1025,15 +1049,22 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         len += 16;
     else if (iap->ia_valid & ATTR_MTIME)
         len += 4;
+    if (label) {
+        len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+        bmval_len = 3;
+    }
+
+    len += bmval_len << 2;
     p = reserve_space(xdr, len);
 
     /*
      * We write the bitmap length now, but leave the bitmap and the attribute
      * buffer length to be backfilled at the end of this routine.
     */
-    *p++ = cpu_to_be32(2);
+    *p++ = cpu_to_be32(bmval_len);
     q = p;
-    p += 3;
+    /* Skip bitmap entries + attrlen */
+    p += bmval_len + 1;
 
     if (iap->ia_valid & ATTR_SIZE) {
         bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1102,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
         *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
     }
+    if (label) {
+        bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+        *p++ = cpu_to_be32(label->lfs);
+        *p++ = cpu_to_be32(label->pi);
+        *p++ = cpu_to_be32(label->len);
+        p = xdr_encode_opaque_fixed(p, label->label, label->len);
+    }
 
     /*
      * Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
             len, ((char *)p - (char *)q) + 4);
         BUG();
     }
-    len = (char *)p - (char *)q - 12;
     *q++ = htonl(bmval0);
     *q++ = htonl(bmval1);
+    if (bmval_len == 3)
+        *q++ = htonl(bmval2);
+    len = (char *)p - (char *)(q + 1);
     *q = htonl(len);
 
 /* out: */
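
encode_attrs() relies on a reserve-then-backfill pattern: the bitmap word count is known up front, so space for the bitmap words and the attribute length is skipped (q remembers where), the attribute body is emitted, and only afterwards are the bitmap and byte length written back; the change above just makes the skipped region bmval_len + 1 words instead of a fixed 3. A self-contained sketch of the same pattern for a fixed two-word bitmap (the buffer layout and names are illustrative, not the kernel's XDR stream API):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>  /* htonl, ntohl */

    /* Encode a bitmap-prefixed attribute blob, backfilling the byte count
     * once the body has been written. */
    static size_t encode_example(uint32_t *buf, uint32_t bmval0, uint32_t bmval1,
                                 const uint32_t *body, size_t body_words)
    {
        uint32_t *p = buf, *q;
        size_t i;

        *p++ = htonl(2);   /* bitmap word count, known up front */
        q = p;             /* remember where bitmap + attrlen go */
        p += 2 + 1;        /* skip two bitmap words and the length */
        for (i = 0; i < body_words; i++)
            *p++ = htonl(body[i]);

        /* Backfill: the bitmap words, then the body length in bytes. */
        *q++ = htonl(bmval0);
        *q++ = htonl(bmval1);
        *q = htonl((uint32_t)((char *)p - (char *)(q + 1)));
        return (size_t)(p - buf);
    }

    int main(void)
    {
        uint32_t buf[16];
        const uint32_t body[] = { 0644, 42 };
        size_t n = encode_example(buf, 0x1, 0x2, body, 2);

        printf("encoded %zu words, attrlen=%u bytes\n", n, ntohl(buf[3]));
        return 0;  /* encoded 6 words, attrlen=8 bytes */
    }
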
@@ -1136,7 +1176,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
     }
 
     encode_string(xdr, create->name->len, create->name->name);
-    encode_attrs(xdr, create->attrs, create->server);
+    encode_attrs(xdr, create->attrs, create->label, create->server);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1228,10 @@ encode_getattr_three(struct xdr_stream *xdr,
 
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-    encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-               bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+    encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+               bitmask[1] & nfs4_fattr_bitmap[1],
+               bitmask[2] & nfs4_fattr_bitmap[2],
+               hdr);
 }
 
 static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1409,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
     switch(arg->createmode) {
     case NFS4_CREATE_UNCHECKED:
         *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-        encode_attrs(xdr, arg->u.attrs, arg->server);
+        encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
         break;
     case NFS4_CREATE_GUARDED:
         *p = cpu_to_be32(NFS4_CREATE_GUARDED);
-        encode_attrs(xdr, arg->u.attrs, arg->server);
+        encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
         break;
     case NFS4_CREATE_EXCLUSIVE:
         *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1423,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
         *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
         encode_nfs4_verifier(xdr, &arg->u.verifier);
         dummy.ia_valid = 0;
-        encode_attrs(xdr, &dummy, arg->server);
+        encode_attrs(xdr, &dummy, arg->label, arg->server);
     }
 }
 
@@ -1532,7 +1574,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-    uint32_t attrs[2] = {
+    uint32_t attrs[3] = {
         FATTR4_WORD0_RDATTR_ERROR,
         FATTR4_WORD1_MOUNTED_ON_FILEID,
     };
@@ -1555,20 +1597,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
     encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
     encode_uint64(xdr, readdir->cookie);
     encode_nfs4_verifier(xdr, &readdir->verifier);
-    p = reserve_space(xdr, 20);
+    p = reserve_space(xdr, encode_readdir_space);
     *p++ = cpu_to_be32(dircount);
     *p++ = cpu_to_be32(readdir->count);
-    *p++ = cpu_to_be32(2);
-
+    *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
     *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
     *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+    if (encode_readdir_bitmask_sz > 2) {
+        if (hdr->minorversion > 1)
+            attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+        p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+    }
     memcpy(verf, readdir->verifier.data, sizeof(verf));
-    dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+    dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
         __func__,
         (unsigned long long)readdir->cookie,
         verf[0], verf[1],
         attrs[0] & readdir->bitmask[0],
-        attrs[1] & readdir->bitmask[1]);
+        attrs[1] & readdir->bitmask[1],
+        attrs[2] & readdir->bitmask[2]);
 }
 
 static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1675,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
     encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
     encode_nfs4_stateid(xdr, &arg->stateid);
-    encode_attrs(xdr, arg->iap, server);
+    encode_attrs(xdr, arg->iap, arg->label, server);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1937,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
     p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
                     NFS4_DEVICEID4_SIZE);
     *p++ = cpu_to_be32(args->pdev->layout_type);
-    *p++ = cpu_to_be32(args->pdev->pglen);     /* gdia_maxcount */
+    *p++ = cpu_to_be32(args->pdev->maxcount);  /* gdia_maxcount */
     *p++ = cpu_to_be32(0);                     /* bitmap length 0 */
 }
 
@@ -4038,6 +4086,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
     return status;
 }
 
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+                    struct nfs4_label *label)
+{
+    uint32_t pi = 0;
+    uint32_t lfs = 0;
+    __u32 len;
+    __be32 *p;
+    int status = 0;
+
+    if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+        return -EIO;
+    if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        lfs = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        pi = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        len = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, len);
+        if (unlikely(!p))
+            goto out_overflow;
+        if (len < NFS4_MAXLABELLEN) {
+            if (label) {
+                memcpy(label->label, p, len);
+                label->len = len;
+                label->pi = pi;
+                label->lfs = lfs;
+                status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+            }
+            bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+        } else
+            printk(KERN_WARNING "%s: label too long (%u)!\n",
+                    __func__, len);
+    }
+    if (label && label->label)
+        dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+            (char *)label->label, label->len, label->pi, label->lfs);
+    return status;
+
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
     int status = 0;
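
decode_attr_security_label() reads three fixed words (LFS, PI, length) and then a variable-length opaque, and refuses any label whose advertised length is not strictly below NFS4_MAXLABELLEN rather than silently truncating it. The same bounds discipline in a stand-alone form (host-order lengths and all names here are simplifications for the sketch, not the kernel's XDR helpers):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAXLABELLEN 16  /* stand-in for NFS4_MAXLABELLEN */

    /* Reject, rather than truncate, any opaque whose advertised length
     * does not fit the caller's buffer or the remaining input. */
    static int decode_opaque(const uint8_t *in, size_t in_len,
                             uint8_t *out, uint32_t *out_len)
    {
        uint32_t len;

        if (in_len < 4)
            return -1;             /* short buffer */
        memcpy(&len, in, 4);       /* host order, for the sketch */
        if (len >= MAXLABELLEN)
            return -1;             /* too long: refuse outright */
        if (in_len - 4 < len)
            return -1;             /* advertised length overruns input */
        memcpy(out, in + 4, len);
        *out_len = len;
        return 0;
    }

    int main(void)
    {
        uint8_t wire[4 + 7];
        uint8_t label[MAXLABELLEN];
        uint32_t len = 7, n;

        memcpy(wire, &len, 4);
        memcpy(wire + 4, "system0", 7);
        if (decode_opaque(wire, sizeof(wire), label, &n) == 0)
            printf("decoded %u-byte label: %.7s\n", n, label);
        return 0;
    }
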
@@ -4380,7 +4478,7 @@ out_overflow:
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
         struct nfs_fattr *fattr, struct nfs_fh *fh,
-        struct nfs4_fs_locations *fs_loc,
+        struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
         const struct nfs_server *server)
 {
     int status;
@@ -4488,6 +4586,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
     if (status < 0)
         goto xdr_error;
 
+    if (label) {
+        status = decode_attr_security_label(xdr, bitmap, label);
+        if (status < 0)
+            goto xdr_error;
+        fattr->valid |= status;
+    }
+
 xdr_error:
     dprintk("%s: xdr returned %d\n", __func__, -status);
     return status;
@@ -4495,7 +4600,7 @@ xdr_error:
 
 static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
-        const struct nfs_server *server)
+        struct nfs4_label *label, const struct nfs_server *server)
 {
     unsigned int savep;
     uint32_t attrlen,
@@ -4514,7 +4619,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
     if (status < 0)
         goto xdr_error;
 
-    status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+    status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+                    label, server);
     if (status < 0)
         goto xdr_error;
 
@@ -4524,10 +4630,16 @@ xdr_error:
     return status;
 }
 
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+        struct nfs4_label *label, const struct nfs_server *server)
+{
+    return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         const struct nfs_server *server)
 {
-    return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+    return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
 }
 
 /*
@@ -5919,7 +6031,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_getfh(xdr, res->fh);
     if (status)
         goto out;
-    status = decode_getfattr(xdr, res->fattr, res->server);
+    status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -5945,7 +6057,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
         goto out;
     status = decode_getfh(xdr, res->fh);
     if (status == 0)
-        status = decode_getfattr(xdr, res->fattr, res->server);
+        status = decode_getfattr_label(xdr, res->fattr,
+                        res->label, res->server);
 out:
     return status;
 }
@@ -6036,7 +6149,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_restorefh(xdr);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6065,7 +6178,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_getfh(xdr, res->fh);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6097,7 +6210,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_putfh(xdr);
     if (status)
         goto out;
-    status = decode_getfattr(xdr, res->fattr, res->server);
+    status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6230,7 +6343,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
         goto out;
     if (res->access_request)
         decode_access(xdr, &res->access_supported, &res->access_result);
-    decode_getfattr(xdr, res->f_attr, res->server);
+    decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
 out:
     return status;
 }
@@ -6307,7 +6420,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
     status = decode_setattr(xdr);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6696,7 +6809,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
     xdr_enter_page(xdr, PAGE_SIZE);
     status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
                      NULL, res->fs_locations,
-                     res->fs_locations->server);
+                     NULL, res->fs_locations->server);
 out:
     return status;
 }
@@ -7109,7 +7222,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
         goto out_overflow;
 
     if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
-                  NULL, entry->server) < 0)
+                  NULL, entry->label, entry->server) < 0)
         goto out_overflow;
     if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
         entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
     pd.pgbase = 0;
     pd.pglen = PAGE_SIZE;
     pd.mincount = 0;
+    pd.maxcount = PAGE_SIZE;
 
-    err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+    err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+            pnfslay->plh_lc_cred);
     dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
     if (err)
         goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
-static inline u64
+static u64
 end_offset(u64 start, u64 len)
 {
     u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
  *         start2           end2
  *         [----------------)
  */
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
-         struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+         const struct pnfs_layout_range *l2)
 {
     u64 start1 = l1->offset;
     u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
  *                  start2           end2
  *                  [----------------)
  */
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
-         struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+         const struct pnfs_layout_range *l2)
 {
     u64 start1 = l1->offset;
     u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
 }
 
 static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
-         struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+         const struct pnfs_layout_range *recall_range)
 {
     return (recall_range->iomode == IOMODE_ANY ||
         lseg_range->iomode == recall_range->iomode) &&
-        lo_seg_intersecting(lseg_range, recall_range);
+        pnfs_lseg_range_intersecting(lseg_range, recall_range);
 }
 
 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
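
The renamed pnfs_lseg_range_* helpers all reduce to half-open interval arithmetic in which a length that overflows the offset means "to end of file", hence the clamping in end_offset(). A user-space sketch of the containment and intersection tests under that convention (struct and names simplified, not the kernel's pnfs_layout_range):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Half-open byte ranges [offset, offset+length). */
    struct range { uint64_t offset, length; };

    static uint64_t range_end(const struct range *r)
    {
        uint64_t end = r->offset + r->length;
        return end >= r->offset ? end : UINT64_MAX;  /* clamp on overflow */
    }

    static bool range_intersecting(const struct range *a, const struct range *b)
    {
        return range_end(a) > b->offset && range_end(b) > a->offset;
    }

    /* Does a fully contain b? */
    static bool range_contained(const struct range *a, const struct range *b)
    {
        return a->offset <= b->offset && range_end(b) <= range_end(a);
    }

    int main(void)
    {
        struct range whole = { 0, UINT64_MAX };  /* "to end of file" */
        struct range mid = { 4096, 8192 };

        printf("intersect: %d, contained: %d\n",
               range_intersecting(&whole, &mid),
               range_contained(&whole, &mid));
        return 0;  /* intersect: 1, contained: 1 */
    }
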
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
     lgp->args.inode = ino;
     lgp->args.ctx = get_nfs_open_context(ctx);
     lgp->gfp_flags = gfp_flags;
+    lgp->cred = lo->plh_lc_cred;
 
     /* Synchronously retrieve layout information from server and
      * store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
     lrp->args.inode = ino;
     lrp->args.layout = lo;
     lrp->clp = NFS_SERVER(ino)->nfs_client;
+    lrp->cred = lo->plh_lc_cred;
 
     status = nfs4_proc_layoutreturn(lrp);
 out:
@@ -984,8 +986,8 @@ out:
  * are seen first.
  */
 static s64
-cmp_layout(struct pnfs_layout_range *l1,
-       struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+       const struct pnfs_layout_range *l2)
 {
     s64 d;
 
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
     dprintk("%s:Begin\n", __func__);
 
     list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-        if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+        if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
             continue;
         list_add_tail(&lseg->pls_list, &lp->pls_list);
         dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
     INIT_LIST_HEAD(&lo->plh_segs);
     INIT_LIST_HEAD(&lo->plh_bulk_destroy);
     lo->plh_inode = ino;
-    lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+    lo->plh_lc_cred = get_rpccred(ctx->cred);
     return lo;
 }
 
@@ -1091,21 +1093,21 @@ out_existing:
  * READ        READ           true
 * READ        RW             true
  */
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
-         struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+         const struct pnfs_layout_range *range)
 {
     struct pnfs_layout_range range1;
 
     if ((range->iomode == IOMODE_RW &&
          ls_range->iomode != IOMODE_RW) ||
-        !lo_seg_intersecting(ls_range, range))
+        !pnfs_lseg_range_intersecting(ls_range, range))
         return 0;
 
     /* range1 covers only the first byte in the range */
     range1 = *range;
     range1.length = 1;
-    return lo_seg_contained(ls_range, &range1);
+    return pnfs_lseg_range_contained(ls_range, &range1);
 }
 
 /*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
     list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
         if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-            is_matching_lseg(&lseg->pls_range, range)) {
+            pnfs_lseg_range_match(&lseg->pls_range, range)) {
             ret = pnfs_get_lseg(lseg);
             break;
         }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
     struct nfs4_deviceid dev_id;
     unsigned int  layout_type;
    unsigned int  mincount;
+    unsigned int  maxcount;  /* gdia_maxcount */
     struct page **pages;
     unsigned int  pgbase;
-    unsigned int  pglen;
+    unsigned int  pglen;     /* reply buffer length */
 };
 
 #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
                    const struct nfs_fh *fh,
                    struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-                   struct pnfs_device *dev);
+                   struct pnfs_device *dev,
+                   struct rpc_cred *cred);
 extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-        struct nfs_fattr *fattr)
+        struct nfs_fattr *fattr, struct nfs4_label *label)
 {
     struct rpc_message msg = {
         .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs_proc_lookup(struct inode *dir, struct qstr *name,
-        struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+        struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+        struct nfs4_label *label)
 {
     struct nfs_diropargs arg = {
         .fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
     status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     nfs_mark_for_revalidate(dir);
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
         status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     }
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
      * should fill in the data with a LOOKUP call on the wire.
      */
     if (status == 0)
-        status = nfs_instantiate(dentry, fh, fattr);
+        status = nfs_instantiate(dentry, fh, fattr, NULL);
 
 out_free:
     nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
     status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     nfs_mark_for_revalidate(dir);
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..f6db66d8f647 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
269 269
270enum { 270enum {
271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, 271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
272 Opt_vers_4_1, 272 Opt_vers_4_1, Opt_vers_4_2,
273 273
274 Opt_vers_err 274 Opt_vers_err
275}; 275};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
280 { Opt_vers_4, "4" }, 280 { Opt_vers_4, "4" },
281 { Opt_vers_4_0, "4.0" }, 281 { Opt_vers_4_0, "4.0" },
282 { Opt_vers_4_1, "4.1" }, 282 { Opt_vers_4_1, "4.1" },
283 { Opt_vers_4_2, "4.2" },
283 284
284 { Opt_vers_err, NULL } 285 { Opt_vers_err, NULL }
285}; 286};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
832 seq_printf(m, "\n\tnfsv4:\t"); 833 seq_printf(m, "\n\tnfsv4:\t");
833 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 834 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
834 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 835 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
836 seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
835 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 837 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
836 show_sessions(m, nfss); 838 show_sessions(m, nfss);
837 show_pnfs(m, nfss); 839 show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
1097 mnt->version = 4; 1099 mnt->version = 4;
1098 mnt->minorversion = 1; 1100 mnt->minorversion = 1;
1099 break; 1101 break;
1102 case Opt_vers_4_2:
1103 mnt->version = 4;
1104 mnt->minorversion = 2;
1105 break;
1100 default: 1106 default:
1101 return 0; 1107 return 0;
1102 } 1108 }
@@ -1608,29 +1614,13 @@ out_security_failure:
1608} 1614}
1609 1615
1610/* 1616/*
1611 * Select a security flavor for this mount. The selected flavor 1617 * Ensure that the specified authtype in args->auth_flavors[0] is supported by
1612 * is planted in args->auth_flavors[0]. 1618 * the server. Returns 0 if it's ok, and -EACCES if not.
1613 *
1614 * Returns 0 on success, -EACCES on failure.
1615 */ 1619 */
1616static int nfs_select_flavor(struct nfs_parsed_mount_data *args, 1620static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
1617 struct nfs_mount_request *request) 1621 rpc_authflavor_t *server_authlist, unsigned int count)
1618{ 1622{
1619 unsigned int i, count = *(request->auth_flav_len); 1623 unsigned int i;
1620 rpc_authflavor_t flavor;
1621
1622 /*
1623 * The NFSv2 MNT operation does not return a flavor list.
1624 */
1625 if (args->mount_server.version != NFS_MNT3_VERSION)
1626 goto out_default;
1627
1628 /*
1629 * Certain releases of Linux's mountd return an empty
1630 * flavor list in some cases.
1631 */
1632 if (count == 0)
1633 goto out_default;
1634 1624
1635 /* 1625 /*
1636 * If the sec= mount option is used, the specified flavor or AUTH_NULL 1626 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
1640 * means that the server will ignore the rpc creds, so any flavor 1630 * means that the server will ignore the rpc creds, so any flavor
1641 * can be used. 1631 * can be used.
1642 */ 1632 */
1643 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1644 for (i = 0; i < count; i++) {
1645 if (args->auth_flavors[0] == request->auth_flavs[i] ||
1646 request->auth_flavs[i] == RPC_AUTH_NULL)
1647 goto out;
1648 }
1649 dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
1650 args->auth_flavors[0]);
1651 goto out_err;
1652 }
1653
1654 /*
1655 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1656 * flavor listed first. However, some servers list
1657 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1658 */
1659 for (i = 0; i < count; i++) {
1660 struct rpcsec_gss_info info;
1661
1662 flavor = request->auth_flavs[i];
1663 switch (flavor) {
1664 case RPC_AUTH_UNIX:
1665 goto out_set;
1666 case RPC_AUTH_NULL:
1667 continue;
1668 default:
1669 if (rpcauth_get_gssinfo(flavor, &info) == 0)
1670 goto out_set;
1671 }
1672 }
1673
1674 /*
1675 * As a last chance, see if the server list contains AUTH_NULL -
1676 * if it does, use the default flavor.
1677 */
1678 for (i = 0; i < count; i++) { 1633 for (i = 0; i < count; i++) {
1679 if (request->auth_flavs[i] == RPC_AUTH_NULL) 1634 if (args->auth_flavors[0] == server_authlist[i] ||
1680 goto out_default; 1635 server_authlist[i] == RPC_AUTH_NULL)
1636 goto out;
1681 } 1637 }
1682 1638
1683 dfprintk(MOUNT, "NFS: no auth flavors in common with server\n"); 1639 dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
1684 goto out_err; 1640 args->auth_flavors[0]);
1641 return -EACCES;
1685 1642
1686out_default:
1687 /* use default if flavor not already set */
1688 flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
1689 RPC_AUTH_UNIX : args->auth_flavors[0];
1690out_set:
1691 args->auth_flavors[0] = flavor;
1692out: 1643out:
1693 dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); 1644 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1694 return 0; 1645 return 0;
1695out_err:
1696 return -EACCES;
1697} 1646}
1698 1647
1699/* 1648/*
@@ -1701,10 +1650,10 @@ out_err:
1701 * corresponding to the provided path. 1650 * corresponding to the provided path.
1702 */ 1651 */
1703static int nfs_request_mount(struct nfs_parsed_mount_data *args, 1652static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1704 struct nfs_fh *root_fh) 1653 struct nfs_fh *root_fh,
1654 rpc_authflavor_t *server_authlist,
1655 unsigned int *server_authlist_len)
1705{ 1656{
1706 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1707 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1708 struct nfs_mount_request request = { 1657 struct nfs_mount_request request = {
1709 .sap = (struct sockaddr *) 1658 .sap = (struct sockaddr *)
1710 &args->mount_server.address, 1659 &args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1712 .protocol = args->mount_server.protocol, 1661 .protocol = args->mount_server.protocol,
1713 .fh = root_fh, 1662 .fh = root_fh,
1714 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1663 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1715 .auth_flav_len = &server_authlist_len, 1664 .auth_flav_len = server_authlist_len,
1716 .auth_flavs = server_authlist, 1665 .auth_flavs = server_authlist,
1717 .net = args->net, 1666 .net = args->net,
1718 }; 1667 };
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1756 return status; 1705 return status;
1757 } 1706 }
1758 1707
1759 return nfs_select_flavor(args, &request); 1708 return 0;
1760} 1709}
1761 1710
1762struct dentry *nfs_try_mount(int flags, const char *dev_name, 1711static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
1763 struct nfs_mount_info *mount_info, 1712 struct nfs_subversion *nfs_mod)
1764 struct nfs_subversion *nfs_mod)
1765{ 1713{
1766 int status; 1714 int status;
1767 struct nfs_server *server; 1715 unsigned int i;
1716 bool tried_auth_unix = false;
1717 bool auth_null_in_list = false;
1718 struct nfs_server *server = ERR_PTR(-EACCES);
1719 struct nfs_parsed_mount_data *args = mount_info->parsed;
1720 rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
1721 unsigned int authlist_len = ARRAY_SIZE(authlist);
1722
1723 status = nfs_request_mount(args, mount_info->mntfh, authlist,
1724 &authlist_len);
1725 if (status)
1726 return ERR_PTR(status);
1768 1727
1769 if (mount_info->parsed->need_mount) { 1728 /*
1770 status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); 1729 * Was a sec= authflavor specified in the options? First, verify
1730 * whether the server supports it, and then just try to use it if so.
1731 */
1732 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1733 status = nfs_verify_authflavor(args, authlist, authlist_len);
1734 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1771 if (status) 1735 if (status)
1772 return ERR_PTR(status); 1736 return ERR_PTR(status);
1737 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1738 }
1739
1740 /*
1741 * No sec= option was provided. RFC 2623, section 2.7 suggests we
1742 * SHOULD prefer the flavor listed first. However, some servers list
1743 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1744 */
1745 for (i = 0; i < authlist_len; ++i) {
1746 rpc_authflavor_t flavor;
1747 struct rpcsec_gss_info info;
1748
1749 flavor = authlist[i];
1750 switch (flavor) {
1751 case RPC_AUTH_UNIX:
1752 tried_auth_unix = true;
1753 break;
1754 case RPC_AUTH_NULL:
1755 auth_null_in_list = true;
1756 continue;
1757 default:
1758 if (rpcauth_get_gssinfo(flavor, &info) != 0)
1759 continue;
1760 /* Fallthrough */
1761 }
1762 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
1763 args->auth_flavors[0] = flavor;
1764 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1765 if (!IS_ERR(server))
1766 return server;
1773 } 1767 }
1774 1768
1775 /* Get a volume representation */ 1769 /*
1776 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1770 * Nothing we tried so far worked. At this point, give up if we've
1771 * already tried AUTH_UNIX or if the server's list doesn't contain
1772 * AUTH_NULL
1773 */
1774 if (tried_auth_unix || !auth_null_in_list)
1775 return server;
1776
1777 /* Last chance! Try AUTH_UNIX */
1778 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
1779 args->auth_flavors[0] = RPC_AUTH_UNIX;
1780 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1781}
1782
1783struct dentry *nfs_try_mount(int flags, const char *dev_name,
1784 struct nfs_mount_info *mount_info,
1785 struct nfs_subversion *nfs_mod)
1786{
1787 struct nfs_server *server;
1788
1789 if (mount_info->parsed->need_mount)
1790 server = nfs_try_mount_request(mount_info, nfs_mod);
1791 else
1792 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1793
1777 if (IS_ERR(server)) 1794 if (IS_ERR(server))
1778 return ERR_CAST(server); 1795 return ERR_CAST(server);
1779 1796
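
Note: the new nfs_try_mount_request() above makes the flavor policy explicit. With an explicit sec= option the flavor is only verified; otherwise the MNT reply list is walked in order, AUTH_NULL is never chosen directly, AUTH_UNIX and usable GSS flavors are each attempted, and AUTH_UNIX is retried last only when AUTH_NULL appeared in the list. A compact userspace sketch of just the candidate ordering (the flavor constants and the GSS probe are stand-ins; the kernel additionally attempts a full mount per candidate before falling back):

#include <stdbool.h>
#include <stdio.h>

enum { AUTH_NULL = 0, AUTH_UNIX = 1, AUTH_GSS_KRB5 = 390003 };

/* Stand-in for rpcauth_get_gssinfo(): pretend only krb5 is loadable. */
static bool gss_usable(int flavor)
{
	return flavor == AUTH_GSS_KRB5;
}

/* First flavor worth trying, or -1 when nothing is acceptable. */
static int first_candidate(const int *list, int n)
{
	bool saw_null = false;

	for (int i = 0; i < n; i++) {
		if (list[i] == AUTH_NULL) {	/* never choose AUTH_NULL itself */
			saw_null = true;
			continue;
		}
		if (list[i] == AUTH_UNIX || gss_usable(list[i]))
			return list[i];
	}
	/* AUTH_NULL in the list means "any flavor works": use AUTH_UNIX. */
	return saw_null ? AUTH_UNIX : -1;
}

int main(void)
{
	int server[] = { AUTH_NULL, AUTH_GSS_KRB5 };

	printf("%d\n", first_candidate(server, 2));	/* 390003, not AUTH_NULL */
	return 0;
}
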
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
2412int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, 2429int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
2413 struct nfs_mount_info *mount_info) 2430 struct nfs_mount_info *mount_info)
2414{ 2431{
2415 return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); 2432 int error;
2433 unsigned long kflags = 0, kflags_out = 0;
2434 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
2435 kflags |= SECURITY_LSM_NATIVE_LABELS;
2436
2437 error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
2438 kflags, &kflags_out);
2439 if (error)
2440 goto err;
2441
2442 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
2443 !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
2444 NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
2445err:
2446 return error;
2416} 2447}
2417EXPORT_SYMBOL_GPL(nfs_set_sb_security); 2448EXPORT_SYMBOL_GPL(nfs_set_sb_security);
2418 2449
@@ -2447,6 +2478,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
2447 if (server->flags & NFS_MOUNT_NOAC) 2478 if (server->flags & NFS_MOUNT_NOAC)
2448 sb_mntdata.mntflags |= MS_SYNCHRONOUS; 2479 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2449 2480
2481 if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
2482 if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
2483 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2484
2450 /* Get a superblock - note that we may end up sharing one that already exists */ 2485 /* Get a superblock - note that we may end up sharing one that already exists */
2451 s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); 2486 s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
2452 if (IS_ERR(s)) { 2487 if (IS_ERR(s)) {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
479 479
480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
481 dentry->d_parent->d_name.name, dentry->d_name.name, 481 dentry->d_parent->d_name.name, dentry->d_name.name,
482 dentry->d_count); 482 d_count(dentry));
483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
484 484
485 /* 485 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
888 return PageUptodate(page) != 0; 888 return PageUptodate(page) != 0;
889} 889}
890 890
891/* If we know the page is up to date, and we're not using byte range locks (or
892 * if we have the whole file locked for writing), it may be more efficient to
893 * extend the write to cover the entire page in order to avoid fragmentation
894 * inefficiencies.
895 *
896 * If the file is opened for synchronous writes or if we have a write delegation
897 * from the server then we can just skip the rest of the checks.
898 */
899static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
900{
901 if (file->f_flags & O_DSYNC)
902 return 0;
903 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
904 return 1;
905 if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
906 (inode->i_flock->fl_start == 0 &&
907 inode->i_flock->fl_end == OFFSET_MAX &&
908 inode->i_flock->fl_type != F_RDLCK)))
909 return 1;
910 return 0;
911}
912
891/* 913/*
892 * Update and possibly write a cached page of an NFS file. 914 * Update and possibly write a cached page of an NFS file.
893 * 915 *
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
908 file->f_path.dentry->d_name.name, count, 930 file->f_path.dentry->d_name.name, count,
909 (long long)(page_file_offset(page) + offset)); 931 (long long)(page_file_offset(page) + offset));
910 932
911 /* If we're not using byte range locks, and we know the page 933 if (nfs_can_extend_write(file, page, inode)) {
912 * is up to date, it may be more efficient to extend the write
913 * to cover the entire page in order to avoid fragmentation
914 * inefficiencies.
915 */
916 if (nfs_write_pageuptodate(page, inode) &&
917 inode->i_flock == NULL &&
918 !(file->f_flags & O_DSYNC)) {
919 count = max(count + offset, nfs_page_length(page)); 934 count = max(count + offset, nfs_page_length(page));
920 offset = 0; 935 offset = 0;
921 } 936 }
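
Note on the arithmetic in nfs_updatepage() once nfs_can_extend_write() agrees: the write is widened to the larger of its own end and the valid page length, restarting from offset 0. A worked example with assumed values (PAGE_SIZE 4096, a fully up-to-date page):

#include <stdio.h>

int main(void)
{
	unsigned int page_len = 4096;	/* assumed nfs_page_length() result */
	unsigned int offset = 512;	/* write begins 512 bytes into the page */
	unsigned int count = 100;	/* caller asked for 100 bytes */

	/* count = max(count + offset, page_len); offset = 0; */
	count = (count + offset > page_len) ? count + offset : page_len;
	offset = 0;

	printf("offset=%u count=%u\n", offset, count);	/* offset=0 count=4096 */
	return 0;
}
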
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
81 81
82 If unsure, say N. 82 If unsure, say N.
83 83
84config NFSD_V4_SECURITY_LABEL
85 bool "Provide Security Label support for NFSv4 server"
86 depends on NFSD_V4 && SECURITY
87 help
88
 89	  Say Y here if you want to enable fine-grained security label attribute
90 support for NFS version 4. Security labels allow security modules like
91 SELinux and Smack to label files to facilitate enforcement of their policies.
92 Without this an NFSv4 mount will have the same label on each file.
93
 94	  If you do not wish to enable fine-grained SELinux or Smack security
 95	  label policies on NFSv4 files, say N.
96
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
 98	  For now we recommend "Y" only for developers and testers.
99
84config NFSD_FAULT_INJECTION 100config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection" 101 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL 102 depends on NFSD_V4 && DEBUG_KERNEL
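
Note: enabling the new option pulls in its Kconfig dependencies; a plausible .config fragment (the module/builtin choices here are illustrative, not prescribed by the patch):

CONFIG_SECURITY=y
CONFIG_NFSD=m
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
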
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..419572f33b72 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h" 43#include "netns.h"
44 44
45#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
46#include <linux/security.h>
47
48static inline void
49nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
50{
51 struct inode *inode = resfh->fh_dentry->d_inode;
52 int status;
53
54 mutex_lock(&inode->i_mutex);
55 status = security_inode_setsecctx(resfh->fh_dentry,
56 label->data, label->len);
57 mutex_unlock(&inode->i_mutex);
58
59 if (status)
60 /*
61 * XXX: We should really fail the whole open, but we may
62 * already have created a new file, so it may be too
63 * late. For now this seems the least of evils:
64 */
65 bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
66
67 return;
68}
69#else
70static inline void
71nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
72{ }
73#endif
74
45#define NFSDDBG_FACILITY NFSDDBG_PROC 75#define NFSDDBG_FACILITY NFSDDBG_PROC
46 76
47static u32 nfsd_attrmask[] = { 77static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
239 (u32 *)open->op_verf.data, 269 (u32 *)open->op_verf.data,
240 &open->op_truncate, &open->op_created); 270 &open->op_truncate, &open->op_created);
241 271
272 if (!status && open->op_label.len)
273 nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
274
242 /* 275 /*
243 * Following rfc 3530 14.2.16, use the returned bitmask 276 * Following rfc 3530 14.2.16, use the returned bitmask
244 * to indicate which attributes we used to store the 277 * to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
263 296
264 nfsd4_set_open_owner_reply_cache(cstate, open, resfh); 297 nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
265 accmode = NFSD_MAY_NOP; 298 accmode = NFSD_MAY_NOP;
266 if (open->op_created) 299 if (open->op_created ||
300 open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
267 accmode |= NFSD_MAY_OWNER_OVERRIDE; 301 accmode |= NFSD_MAY_OWNER_OVERRIDE;
268 status = do_open_permission(rqstp, resfh, open, accmode); 302 status = do_open_permission(rqstp, resfh, open, accmode);
269 set_change_info(&open->op_cinfo, current_fh); 303 set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
637 if (status) 671 if (status)
638 goto out; 672 goto out;
639 673
674 if (create->cr_label.len)
675 nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
676
640 if (create->cr_acl != NULL) 677 if (create->cr_acl != NULL)
641 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, 678 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
642 create->cr_bmval); 679 create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
916 setattr->sa_acl); 953 setattr->sa_acl);
917 if (status) 954 if (status)
918 goto out; 955 goto out;
956 if (setattr->sa_label.len)
957 status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
958 &setattr->sa_label);
959 if (status)
960 goto out;
919 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 961 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
920 0, (time_t)0); 962 0, (time_t)0);
921out: 963out:
@@ -1251,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1251 * According to RFC3010, this takes precedence over all other errors. 1293 * According to RFC3010, this takes precedence over all other errors.
1252 */ 1294 */
1253 status = nfserr_minor_vers_mismatch; 1295 status = nfserr_minor_vers_mismatch;
1254 if (args->minorversion > nfsd_supported_minorversion) 1296 if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
1255 goto out; 1297 goto out;
1256 1298
1257 status = nfs41_check_op_ordering(args); 1299 status = nfs41_check_op_ordering(args);
@@ -1482,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1482static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1524static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1483{ 1525{
1484 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1526 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1485 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ 1527 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\
1486 2 + /*eir_server_owner.so_minor_id */\ 1528 2 + /*eir_server_owner.so_minor_id */\
1487 /* eir_server_owner.so_major_id<> */\ 1529 /* eir_server_owner.so_major_id<> */\
1488 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ 1530 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
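
Note: nfsd4_security_inode_setsecctx() above follows the usual kernel idiom of compiling the real body only under the config option and providing an empty static inline stub otherwise, so call sites stay free of #ifdef clutter. A standalone sketch of the idiom (CONFIG_FOO and do_label() are hypothetical names):

#include <stdio.h>

/* #define CONFIG_FOO 1	-- define to compile the real body */

#ifdef CONFIG_FOO
static inline void do_label(const char *name)
{
	printf("labelling %s\n", name);	/* feature configured in */
}
#else
static inline void do_label(const char *name) { }	/* no-op stub */
#endif

int main(void)
{
	do_label("file");	/* caller needs no #ifdef of its own */
	return 0;
}
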
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e9a21db867a..105a3b080d12 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -240,11 +240,16 @@ struct name_list {
240 struct list_head list; 240 struct list_head list;
241}; 241};
242 242
243struct nfs4_dir_ctx {
244 struct dir_context ctx;
245 struct list_head names;
246};
247
243static int 248static int
244nfsd4_build_namelist(void *arg, const char *name, int namlen, 249nfsd4_build_namelist(void *arg, const char *name, int namlen,
245 loff_t offset, u64 ino, unsigned int d_type) 250 loff_t offset, u64 ino, unsigned int d_type)
246{ 251{
247 struct list_head *names = arg; 252 struct nfs4_dir_ctx *ctx = arg;
248 struct name_list *entry; 253 struct name_list *entry;
249 254
250 if (namlen != HEXDIR_LEN - 1) 255 if (namlen != HEXDIR_LEN - 1)
@@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
254 return -ENOMEM; 259 return -ENOMEM;
255 memcpy(entry->name, name, HEXDIR_LEN - 1); 260 memcpy(entry->name, name, HEXDIR_LEN - 1);
256 entry->name[HEXDIR_LEN - 1] = '\0'; 261 entry->name[HEXDIR_LEN - 1] = '\0';
257 list_add(&entry->list, names); 262 list_add(&entry->list, &ctx->names);
258 return 0; 263 return 0;
259} 264}
260 265
@@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
263{ 268{
264 const struct cred *original_cred; 269 const struct cred *original_cred;
265 struct dentry *dir = nn->rec_file->f_path.dentry; 270 struct dentry *dir = nn->rec_file->f_path.dentry;
266 LIST_HEAD(names); 271 struct nfs4_dir_ctx ctx = {
272 .ctx.actor = nfsd4_build_namelist,
273 .names = LIST_HEAD_INIT(ctx.names)
274 };
267 int status; 275 int status;
268 276
269 status = nfs4_save_creds(&original_cred); 277 status = nfs4_save_creds(&original_cred);
@@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
276 return status; 284 return status;
277 } 285 }
278 286
279 status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); 287 status = iterate_dir(nn->rec_file, &ctx.ctx);
280 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 288 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
281 while (!list_empty(&names)) { 289 while (!list_empty(&ctx.names)) {
282 struct name_list *entry; 290 struct name_list *entry;
283 entry = list_entry(names.next, struct name_list, list); 291 entry = list_entry(ctx.names.next, struct name_list, list);
284 if (!status) { 292 if (!status) {
285 struct dentry *dentry; 293 struct dentry *dentry;
286 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); 294 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
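
Note: the vfs_readdir() to iterate_dir() conversion above relies on struct dir_context being the first member of struct nfs4_dir_ctx, so the actor's argument can be cast straight back to the outer struct. container_of() expresses the same recovery without depending on member order. A userspace sketch of the shape (dir_context, the actor signature and iterate_dir() are mocked here, not the kernel API):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct dir_context {			/* mock of the kernel struct */
	int (*actor)(struct dir_context *, const char *);
};

struct my_ctx {
	struct dir_context ctx;		/* first member: a plain cast also works */
	int count;
};

static int my_actor(struct dir_context *base, const char *name)
{
	/* Recover the outer struct regardless of member position. */
	struct my_ctx *mc = container_of(base, struct my_ctx, ctx);

	mc->count++;
	printf("entry %d: %s\n", mc->count, name);
	return 0;
}

/* Mock iterate_dir(): feeds two fake directory entries to the actor. */
static void iterate_dir_mock(struct dir_context *ctx)
{
	ctx->actor(ctx, "aa");
	ctx->actor(ctx, "bb");
}

int main(void)
{
	struct my_ctx mc = { .ctx.actor = my_actor, .count = 0 };

	iterate_dir_mock(&mc.ctx);
	return 0;
}
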
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 316ec843dec2..43f42290e5df 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
97 97
98static void free_session(struct nfsd4_session *); 98static void free_session(struct nfsd4_session *);
99 99
100void nfsd4_put_session(struct nfsd4_session *ses) 100static bool is_session_dead(struct nfsd4_session *ses)
101{ 101{
102 atomic_dec(&ses->se_ref); 102 return ses->se_flags & NFS4_SESSION_DEAD;
103} 103}
104 104
105static bool is_session_dead(struct nfsd4_session *ses) 105void nfsd4_put_session(struct nfsd4_session *ses)
106{ 106{
107 return ses->se_flags & NFS4_SESSION_DEAD; 107 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
108 free_session(ses);
108} 109}
109 110
110static __be32 mark_session_dead_locked(struct nfsd4_session *ses) 111static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
111{ 112{
112 if (atomic_read(&ses->se_ref)) 113 if (atomic_read(&ses->se_ref) > ref_held_by_me)
113 return nfserr_jukebox; 114 return nfserr_jukebox;
114 ses->se_flags |= NFS4_SESSION_DEAD; 115 ses->se_flags |= NFS4_SESSION_DEAD;
115 return nfs_ok; 116 return nfs_ok;
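
Note: the reordering above changes the lifetime rule: a session may only be marked dead when no one else holds a reference (callers discount references they hold themselves), and the last put of a dead session frees it. A small sketch of the dec-and-test pattern with C11 atomics (the session struct and free routine are stand-ins; the kernel version also runs under nn->client_lock):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct session {
	atomic_int ref;
	bool dead;
};

static void free_session(struct session *s)
{
	printf("freeing session\n");
}

/* Refuse while other holders remain; -1 stands in for nfserr_jukebox. */
static int mark_dead(struct session *s, int ref_held_by_me)
{
	if (atomic_load(&s->ref) > ref_held_by_me)
		return -1;
	s->dead = true;
	return 0;
}

/* The last put of a dead session frees it. */
static void put_session(struct session *s)
{
	if (atomic_fetch_sub(&s->ref, 1) == 1 && s->dead)
		free_session(s);
}

int main(void)
{
	struct session s = { .ref = 1, .dead = false };

	mark_dead(&s, 1);	/* our own reference does not block it */
	put_session(&s);	/* drops to zero: frees */
	return 0;
}
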
@@ -364,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
364} 365}
365 366
366static struct nfs4_delegation * 367static struct nfs4_delegation *
367alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) 368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
368{ 369{
369 struct nfs4_delegation *dp; 370 struct nfs4_delegation *dp;
370 struct nfs4_file *fp = stp->st_file; 371 struct nfs4_file *fp = stp->st_file;
371 372
372 dprintk("NFSD alloc_init_deleg\n"); 373 dprintk("NFSD alloc_init_deleg\n");
373 /*
374 * Major work on the lease subsystem (for example, to support
 375	 * callbacks on stat) will be required before we can support
376 * write delegations properly.
377 */
378 if (type != NFS4_OPEN_DELEGATE_READ)
379 return NULL;
380 if (fp->fi_had_conflict) 374 if (fp->fi_had_conflict)
381 return NULL; 375 return NULL;
382 if (num_delegations > max_delegations) 376 if (num_delegations > max_delegations)
@@ -397,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
397 INIT_LIST_HEAD(&dp->dl_recall_lru); 391 INIT_LIST_HEAD(&dp->dl_recall_lru);
398 get_nfs4_file(fp); 392 get_nfs4_file(fp);
399 dp->dl_file = fp; 393 dp->dl_file = fp;
400 dp->dl_type = type; 394 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
401 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 395 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
402 dp->dl_time = 0; 396 dp->dl_time = 0;
403 atomic_set(&dp->dl_count, 1); 397 atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1188 target->cr_gid = source->cr_gid; 1182 target->cr_gid = source->cr_gid;
1189 target->cr_group_info = source->cr_group_info; 1183 target->cr_group_info = source->cr_group_info;
1190 get_group_info(target->cr_group_info); 1184 get_group_info(target->cr_group_info);
1185 target->cr_gss_mech = source->cr_gss_mech;
1186 if (source->cr_gss_mech)
1187 gss_mech_get(source->cr_gss_mech);
1191 return 0; 1188 return 0;
1192} 1189}
1193 1190
@@ -1262,6 +1259,33 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1262 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); 1259 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
1263} 1260}
1264 1261
1262static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
1263{
1264 struct svc_cred *cr = &rqstp->rq_cred;
1265 u32 service;
1266
1267 if (!cr->cr_gss_mech)
1268 return false;
1269 service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
1270 return service == RPC_GSS_SVC_INTEGRITY ||
1271 service == RPC_GSS_SVC_PRIVACY;
1272}
1273
1274static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1275{
1276 struct svc_cred *cr = &rqstp->rq_cred;
1277
1278 if (!cl->cl_mach_cred)
1279 return true;
1280 if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
1281 return false;
1282 if (!svc_rqst_integrity_protected(rqstp))
1283 return false;
1284 if (!cr->cr_principal)
1285 return false;
1286 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1287}
1288
1265static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1289static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1266{ 1290{
1267 static u32 current_clientid = 1; 1291 static u32 current_clientid = 1;
@@ -1639,16 +1663,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1639 if (exid->flags & ~EXCHGID4_FLAG_MASK_A) 1663 if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
1640 return nfserr_inval; 1664 return nfserr_inval;
1641 1665
1642 /* Currently only support SP4_NONE */
1643 switch (exid->spa_how) { 1666 switch (exid->spa_how) {
1667 case SP4_MACH_CRED:
1668 if (!svc_rqst_integrity_protected(rqstp))
1669 return nfserr_inval;
1644 case SP4_NONE: 1670 case SP4_NONE:
1645 break; 1671 break;
1646 default: /* checked by xdr code */ 1672 default: /* checked by xdr code */
1647 WARN_ON_ONCE(1); 1673 WARN_ON_ONCE(1);
1648 case SP4_SSV: 1674 case SP4_SSV:
1649 return nfserr_encr_alg_unsupp; 1675 return nfserr_encr_alg_unsupp;
1650 case SP4_MACH_CRED:
1651 return nfserr_serverfault; /* no excuse :-/ */
1652 } 1676 }
1653 1677
1654 /* Cases below refer to rfc 5661 section 18.35.4: */ 1678 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1687,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1663 status = nfserr_inval; 1687 status = nfserr_inval;
1664 goto out; 1688 goto out;
1665 } 1689 }
1690 if (!mach_creds_match(conf, rqstp)) {
1691 status = nfserr_wrong_cred;
1692 goto out;
1693 }
1666 if (!creds_match) { /* case 9 */ 1694 if (!creds_match) { /* case 9 */
1667 status = nfserr_perm; 1695 status = nfserr_perm;
1668 goto out; 1696 goto out;
@@ -1709,7 +1737,8 @@ out_new:
1709 status = nfserr_jukebox; 1737 status = nfserr_jukebox;
1710 goto out; 1738 goto out;
1711 } 1739 }
1712 new->cl_minorversion = 1; 1740 new->cl_minorversion = cstate->minorversion;
1741 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1713 1742
1714 gen_clid(new, nn); 1743 gen_clid(new, nn);
1715 add_to_unconfirmed(new); 1744 add_to_unconfirmed(new);
@@ -1839,6 +1868,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1839 return nfs_ok; 1868 return nfs_ok;
1840} 1869}
1841 1870
1871static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
1872{
1873 switch (cbs->flavor) {
1874 case RPC_AUTH_NULL:
1875 case RPC_AUTH_UNIX:
1876 return nfs_ok;
1877 default:
1878 /*
1879 * GSS case: the spec doesn't allow us to return this
1880 * error. But it also doesn't allow us not to support
1881 * GSS.
1882 * I'd rather this fail hard than return some error the
1883 * client might think it can already handle:
1884 */
1885 return nfserr_encr_alg_unsupp;
1886 }
1887}
1888
1842__be32 1889__be32
1843nfsd4_create_session(struct svc_rqst *rqstp, 1890nfsd4_create_session(struct svc_rqst *rqstp,
1844 struct nfsd4_compound_state *cstate, 1891 struct nfsd4_compound_state *cstate,
@@ -1854,6 +1901,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1854 1901
1855 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1902 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1856 return nfserr_inval; 1903 return nfserr_inval;
1904 status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
1905 if (status)
1906 return status;
1857 status = check_forechannel_attrs(&cr_ses->fore_channel, nn); 1907 status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
1858 if (status) 1908 if (status)
1859 return status; 1909 return status;
@@ -1874,6 +1924,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1874 WARN_ON_ONCE(conf && unconf); 1924 WARN_ON_ONCE(conf && unconf);
1875 1925
1876 if (conf) { 1926 if (conf) {
1927 status = nfserr_wrong_cred;
1928 if (!mach_creds_match(conf, rqstp))
1929 goto out_free_conn;
1877 cs_slot = &conf->cl_cs_slot; 1930 cs_slot = &conf->cl_cs_slot;
1878 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1931 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1879 if (status == nfserr_replay_cache) { 1932 if (status == nfserr_replay_cache) {
@@ -1890,6 +1943,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1890 status = nfserr_clid_inuse; 1943 status = nfserr_clid_inuse;
1891 goto out_free_conn; 1944 goto out_free_conn;
1892 } 1945 }
1946 status = nfserr_wrong_cred;
1947 if (!mach_creds_match(unconf, rqstp))
1948 goto out_free_conn;
1893 cs_slot = &unconf->cl_cs_slot; 1949 cs_slot = &unconf->cl_cs_slot;
1894 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1950 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1895 if (status) { 1951 if (status) {
@@ -1957,7 +2013,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
1957{ 2013{
1958 struct nfsd4_session *session = cstate->session; 2014 struct nfsd4_session *session = cstate->session;
1959 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2015 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2016 __be32 status;
1960 2017
2018 status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
2019 if (status)
2020 return status;
1961 spin_lock(&nn->client_lock); 2021 spin_lock(&nn->client_lock);
1962 session->se_cb_prog = bc->bc_cb_program; 2022 session->se_cb_prog = bc->bc_cb_program;
1963 session->se_cb_sec = bc->bc_cb_sec; 2023 session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2046,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1986 status = nfserr_badsession; 2046 status = nfserr_badsession;
1987 if (!session) 2047 if (!session)
1988 goto out; 2048 goto out;
2049 status = nfserr_wrong_cred;
2050 if (!mach_creds_match(session->se_client, rqstp))
2051 goto out;
1989 status = nfsd4_map_bcts_dir(&bcts->dir); 2052 status = nfsd4_map_bcts_dir(&bcts->dir);
1990 if (status) 2053 if (status)
1991 goto out; 2054 goto out;
@@ -2014,6 +2077,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2014{ 2077{
2015 struct nfsd4_session *ses; 2078 struct nfsd4_session *ses;
2016 __be32 status; 2079 __be32 status;
2080 int ref_held_by_me = 0;
2017 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2081 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
2018 2082
2019 nfs4_lock_state(); 2083 nfs4_lock_state();
@@ -2021,6 +2085,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2021 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2085 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2022 if (!nfsd4_last_compound_op(r)) 2086 if (!nfsd4_last_compound_op(r))
2023 goto out; 2087 goto out;
2088 ref_held_by_me++;
2024 } 2089 }
2025 dump_sessionid(__func__, &sessionid->sessionid); 2090 dump_sessionid(__func__, &sessionid->sessionid);
2026 spin_lock(&nn->client_lock); 2091 spin_lock(&nn->client_lock);
@@ -2028,17 +2093,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
2028 status = nfserr_badsession; 2093 status = nfserr_badsession;
2029 if (!ses) 2094 if (!ses)
2030 goto out_client_lock; 2095 goto out_client_lock;
2031 status = mark_session_dead_locked(ses); 2096 status = nfserr_wrong_cred;
2032 if (status) 2097 if (!mach_creds_match(ses->se_client, r))
2033 goto out_client_lock; 2098 goto out_client_lock;
2099 nfsd4_get_session_locked(ses);
2100 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2101 if (status)
2102 goto out_put_session;
2034 unhash_session(ses); 2103 unhash_session(ses);
2035 spin_unlock(&nn->client_lock); 2104 spin_unlock(&nn->client_lock);
2036 2105
2037 nfsd4_probe_callback_sync(ses->se_client); 2106 nfsd4_probe_callback_sync(ses->se_client);
2038 2107
2039 spin_lock(&nn->client_lock); 2108 spin_lock(&nn->client_lock);
2040 free_session(ses);
2041 status = nfs_ok; 2109 status = nfs_ok;
2110out_put_session:
2111 nfsd4_put_session(ses);
2042out_client_lock: 2112out_client_lock:
2043 spin_unlock(&nn->client_lock); 2113 spin_unlock(&nn->client_lock);
2044out: 2114out:
@@ -2058,26 +2128,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
2058 return NULL; 2128 return NULL;
2059} 2129}
2060 2130
2061static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) 2131static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
2062{ 2132{
2063 struct nfs4_client *clp = ses->se_client; 2133 struct nfs4_client *clp = ses->se_client;
2064 struct nfsd4_conn *c; 2134 struct nfsd4_conn *c;
2135 __be32 status = nfs_ok;
2065 int ret; 2136 int ret;
2066 2137
2067 spin_lock(&clp->cl_lock); 2138 spin_lock(&clp->cl_lock);
2068 c = __nfsd4_find_conn(new->cn_xprt, ses); 2139 c = __nfsd4_find_conn(new->cn_xprt, ses);
2069 if (c) { 2140 if (c)
2070 spin_unlock(&clp->cl_lock); 2141 goto out_free;
2071 free_conn(new); 2142 status = nfserr_conn_not_bound_to_session;
2072 return; 2143 if (clp->cl_mach_cred)
2073 } 2144 goto out_free;
2074 __nfsd4_hash_conn(new, ses); 2145 __nfsd4_hash_conn(new, ses);
2075 spin_unlock(&clp->cl_lock); 2146 spin_unlock(&clp->cl_lock);
2076 ret = nfsd4_register_conn(new); 2147 ret = nfsd4_register_conn(new);
2077 if (ret) 2148 if (ret)
2078 /* oops; xprt is already down: */ 2149 /* oops; xprt is already down: */
2079 nfsd4_conn_lost(&new->cn_xpt_user); 2150 nfsd4_conn_lost(&new->cn_xpt_user);
2080 return; 2151 return nfs_ok;
2152out_free:
2153 spin_unlock(&clp->cl_lock);
2154 free_conn(new);
2155 return status;
2081} 2156}
2082 2157
2083static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) 2158static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2244,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2169 if (status) 2244 if (status)
2170 goto out_put_session; 2245 goto out_put_session;
2171 2246
2172 nfsd4_sequence_check_conn(conn, session); 2247 status = nfsd4_sequence_check_conn(conn, session);
2173 conn = NULL; 2248 conn = NULL;
2249 if (status)
2250 goto out_put_session;
2174 2251
2175 /* Success! bump slot seqid */ 2252 /* Success! bump slot seqid */
2176 slot->sl_seqid = seq->seqid; 2253 slot->sl_seqid = seq->seqid;
@@ -2232,7 +2309,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2232 status = nfserr_stale_clientid; 2309 status = nfserr_stale_clientid;
2233 goto out; 2310 goto out;
2234 } 2311 }
2235 2312 if (!mach_creds_match(clp, rqstp)) {
2313 status = nfserr_wrong_cred;
2314 goto out;
2315 }
2236 expire_client(clp); 2316 expire_client(clp);
2237out: 2317out:
2238 nfs4_unlock_state(); 2318 nfs4_unlock_state();
@@ -2645,13 +2725,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2645 2725
2646 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); 2726 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
2647 2727
2648 /* only place dl_time is set. protected by lock_flocks*/ 2728 /* Only place dl_time is set; protected by i_lock: */
2649 dp->dl_time = get_seconds(); 2729 dp->dl_time = get_seconds();
2650 2730
2651 nfsd4_cb_recall(dp); 2731 nfsd4_cb_recall(dp);
2652} 2732}
2653 2733
2654/* Called from break_lease() with lock_flocks() held. */ 2734/* Called from break_lease() with i_lock held. */
2655static void nfsd_break_deleg_cb(struct file_lock *fl) 2735static void nfsd_break_deleg_cb(struct file_lock *fl)
2656{ 2736{
2657 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2737 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -2940,13 +3020,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
2940 return fl; 3020 return fl;
2941} 3021}
2942 3022
2943static int nfs4_setlease(struct nfs4_delegation *dp, int flag) 3023static int nfs4_setlease(struct nfs4_delegation *dp)
2944{ 3024{
2945 struct nfs4_file *fp = dp->dl_file; 3025 struct nfs4_file *fp = dp->dl_file;
2946 struct file_lock *fl; 3026 struct file_lock *fl;
2947 int status; 3027 int status;
2948 3028
2949 fl = nfs4_alloc_init_lease(dp, flag); 3029 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
2950 if (!fl) 3030 if (!fl)
2951 return -ENOMEM; 3031 return -ENOMEM;
2952 fl->fl_file = find_readable_file(fp); 3032 fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3044,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2964 return 0; 3044 return 0;
2965} 3045}
2966 3046
2967static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) 3047static int nfs4_set_delegation(struct nfs4_delegation *dp)
2968{ 3048{
2969 struct nfs4_file *fp = dp->dl_file; 3049 struct nfs4_file *fp = dp->dl_file;
2970 3050
2971 if (!fp->fi_lease) 3051 if (!fp->fi_lease)
2972 return nfs4_setlease(dp, flag); 3052 return nfs4_setlease(dp);
2973 spin_lock(&recall_lock); 3053 spin_lock(&recall_lock);
2974 if (fp->fi_had_conflict) { 3054 if (fp->fi_had_conflict) {
2975 spin_unlock(&recall_lock); 3055 spin_unlock(&recall_lock);
@@ -3005,6 +3085,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3005 3085
3006/* 3086/*
3007 * Attempt to hand out a delegation. 3087 * Attempt to hand out a delegation.
3088 *
3089 * Note we don't support write delegations, and won't until the vfs has
3090 * proper support for them.
3008 */ 3091 */
3009static void 3092static void
3010nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3093nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3096,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3013 struct nfs4_delegation *dp; 3096 struct nfs4_delegation *dp;
3014 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3097 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
3015 int cb_up; 3098 int cb_up;
3016 int status = 0, flag = 0; 3099 int status = 0;
3017 3100
3018 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); 3101 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
3019 flag = NFS4_OPEN_DELEGATE_NONE;
3020 open->op_recall = 0; 3102 open->op_recall = 0;
3021 switch (open->op_claim_type) { 3103 switch (open->op_claim_type) {
3022 case NFS4_OPEN_CLAIM_PREVIOUS: 3104 case NFS4_OPEN_CLAIM_PREVIOUS:
3023 if (!cb_up) 3105 if (!cb_up)
3024 open->op_recall = 1; 3106 open->op_recall = 1;
3025 flag = open->op_delegate_type; 3107 if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
3026 if (flag == NFS4_OPEN_DELEGATE_NONE) 3108 goto out_no_deleg;
3027 goto out;
3028 break; 3109 break;
3029 case NFS4_OPEN_CLAIM_NULL: 3110 case NFS4_OPEN_CLAIM_NULL:
3030 /* Let's not give out any delegations till everyone's 3111 /*
3031 * had the chance to reclaim theirs.... */ 3112 * Let's not give out any delegations till everyone's
3113 * had the chance to reclaim theirs....
3114 */
3032 if (locks_in_grace(net)) 3115 if (locks_in_grace(net))
3033 goto out; 3116 goto out_no_deleg;
3034 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3117 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3035 goto out; 3118 goto out_no_deleg;
3119 /*
3120 * Also, if the file was opened for write or
3121 * create, there's a good chance the client's
3122 * about to write to it, resulting in an
3123 * immediate recall (since we don't support
3124 * write delegations):
3125 */
3036 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 3126 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
3037 flag = NFS4_OPEN_DELEGATE_WRITE; 3127 goto out_no_deleg;
3038 else 3128 if (open->op_create == NFS4_OPEN_CREATE)
3039 flag = NFS4_OPEN_DELEGATE_READ; 3129 goto out_no_deleg;
3040 break; 3130 break;
3041 default: 3131 default:
3042 goto out; 3132 goto out_no_deleg;
3043 } 3133 }
3044 3134 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
3045 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
3046 if (dp == NULL) 3135 if (dp == NULL)
3047 goto out_no_deleg; 3136 goto out_no_deleg;
3048 status = nfs4_set_delegation(dp, flag); 3137 status = nfs4_set_delegation(dp);
3049 if (status) 3138 if (status)
3050 goto out_free; 3139 goto out_free;
3051 3140
@@ -3053,24 +3142,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3053 3142
3054 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3143 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3055 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3144 STATEID_VAL(&dp->dl_stid.sc_stateid));
3056out: 3145 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3057 open->op_delegate_type = flag;
3058 if (flag == NFS4_OPEN_DELEGATE_NONE) {
3059 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3060 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
3061 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3062
3063 /* 4.1 client asking for a delegation? */
3064 if (open->op_deleg_want)
3065 nfsd4_open_deleg_none_ext(open, status);
3066 }
3067 return; 3146 return;
3068out_free: 3147out_free:
3069 unhash_stid(&dp->dl_stid); 3148 unhash_stid(&dp->dl_stid);
3070 nfs4_put_delegation(dp); 3149 nfs4_put_delegation(dp);
3071out_no_deleg: 3150out_no_deleg:
3072 flag = NFS4_OPEN_DELEGATE_NONE; 3151 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3073 goto out; 3152 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3153 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
3154 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3155 open->op_recall = 1;
3156 }
3157
3158 /* 4.1 client asking for a delegation? */
3159 if (open->op_deleg_want)
3160 nfsd4_open_deleg_none_ext(open, status);
3161 return;
3074} 3162}
3075 3163
3076static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, 3164static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3515,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
3427/* Returns true iff a is later than b: */ 3515/* Returns true iff a is later than b: */
3428static bool stateid_generation_after(stateid_t *a, stateid_t *b) 3516static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3429{ 3517{
3430 return (s32)a->si_generation - (s32)b->si_generation > 0; 3518 return (s32)(a->si_generation - b->si_generation) > 0;
3431} 3519}
3432 3520
3433static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) 3521static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
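
Note on the stateid_generation_after() change above: subtracting the two u32 generations first is well defined (unsigned arithmetic wraps modulo 2^32), and casting the 32-bit difference to s32 gives a serial-number comparison in the style of RFC 1982. Casting each operand to s32 before subtracting, as the old code did, can overflow a signed int, which is undefined behaviour. A worked example around the wrap point (the generation values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t a = 0x80000001;	/* "later", just past the wrap */
	uint32_t b = 0x7fffffff;	/* "earlier" */

	/* Fixed form: a - b wraps to 2, the signed view is positive,
	 * so a is later. Prints 1. */
	printf("%d\n", (int32_t)(a - b) > 0);

	/* Old form, (int32_t)a - (int32_t)b, would evaluate roughly
	 * INT32_MIN - INT32_MAX: signed overflow, undefined behaviour. */
	return 0;
}
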
@@ -4435,7 +4523,6 @@ __be32
4435nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4523nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4436 struct nfsd4_locku *locku) 4524 struct nfsd4_locku *locku)
4437{ 4525{
4438 struct nfs4_lockowner *lo;
4439 struct nfs4_ol_stateid *stp; 4526 struct nfs4_ol_stateid *stp;
4440 struct file *filp = NULL; 4527 struct file *filp = NULL;
4441 struct file_lock *file_lock = NULL; 4528 struct file_lock *file_lock = NULL;
@@ -4468,10 +4555,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4468 status = nfserr_jukebox; 4555 status = nfserr_jukebox;
4469 goto out; 4556 goto out;
4470 } 4557 }
4471 lo = lockowner(stp->st_stateowner);
4472 locks_init_lock(file_lock); 4558 locks_init_lock(file_lock);
4473 file_lock->fl_type = F_UNLCK; 4559 file_lock->fl_type = F_UNLCK;
4474 file_lock->fl_owner = (fl_owner_t)lo; 4560 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4475 file_lock->fl_pid = current->tgid; 4561 file_lock->fl_pid = current->tgid;
4476 file_lock->fl_file = filp; 4562 file_lock->fl_file = filp;
4477 file_lock->fl_flags = FL_POSIX; 4563 file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4576,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4490 update_stateid(&stp->st_stid.sc_stateid); 4576 update_stateid(&stp->st_stid.sc_stateid);
4491 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4577 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4492 4578
4493 if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
4494 WARN_ON_ONCE(cstate->replay_owner);
4495 release_lockowner(lo);
4496 }
4497
4498out: 4579out:
4499 nfsd4_bump_seqid(cstate, status); 4580 nfsd4_bump_seqid(cstate, status);
4500 if (!cstate->replay_owner) 4581 if (!cstate->replay_owner)
@@ -4520,7 +4601,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4520 struct inode *inode = filp->fi_inode; 4601 struct inode *inode = filp->fi_inode;
4521 int status = 0; 4602 int status = 0;
4522 4603
4523 lock_flocks(); 4604 spin_lock(&inode->i_lock);
4524 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4605 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
4525 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4606 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
4526 status = 1; 4607 status = 1;
@@ -4528,7 +4609,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4528 } 4609 }
4529 } 4610 }
4530out: 4611out:
4531 unlock_flocks(); 4612 spin_unlock(&inode->i_lock);
4532 return status; 4613 return status;
4533} 4614}
4534 4615
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..c2a4701d7286 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
55#include "cache.h" 55#include "cache.h"
56#include "netns.h" 56#include "netns.h"
57 57
58#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
59#include <linux/security.h>
60#endif
61
62
58#define NFSDDBG_FACILITY NFSDDBG_XDR 63#define NFSDDBG_FACILITY NFSDDBG_XDR
59 64
60/* 65/*
@@ -134,6 +139,19 @@ xdr_error: \
134 } \ 139 } \
135} while (0) 140} while (0)
136 141
142static void next_decode_page(struct nfsd4_compoundargs *argp)
143{
144 argp->pagelist++;
145 argp->p = page_address(argp->pagelist[0]);
146 if (argp->pagelen < PAGE_SIZE) {
147 argp->end = argp->p + (argp->pagelen>>2);
148 argp->pagelen = 0;
149 } else {
150 argp->end = argp->p + (PAGE_SIZE>>2);
151 argp->pagelen -= PAGE_SIZE;
152 }
153}
154
137static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) 155static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
138{ 156{
139 /* We want more bytes than seem to be available. 157 /* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
161 * guarantee p points to at least nbytes bytes. 179 * guarantee p points to at least nbytes bytes.
162 */ 180 */
163 memcpy(p, argp->p, avail); 181 memcpy(p, argp->p, avail);
164 /* step to next page */ 182 next_decode_page(argp);
165 argp->p = page_address(argp->pagelist[0]);
166 argp->pagelist++;
167 if (argp->pagelen < PAGE_SIZE) {
168 argp->end = argp->p + (argp->pagelen>>2);
169 argp->pagelen = 0;
170 } else {
171 argp->end = argp->p + (PAGE_SIZE>>2);
172 argp->pagelen -= PAGE_SIZE;
173 }
174 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 183 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
175 argp->p += XDR_QUADLEN(nbytes - avail); 184 argp->p += XDR_QUADLEN(nbytes - avail);
176 return p; 185 return p;
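
Note: the buffers in next_decode_page() are addressed in 32-bit XDR words, hence the >>2 shifts: pagelen>>2 converts the remaining byte count to words, and XDR_QUADLEN() rounds a byte count up to whole words. A worked example of both conversions (PAGE_SIZE assumed to be 4096):

#include <stdio.h>

#define XDR_QUADLEN(n)	(((n) + 3) >> 2)	/* bytes, rounded up to words */

int main(void)
{
	unsigned int page_size = 4096;

	printf("%u\n", page_size >> 2);		/* 1024 words per full page */
	printf("%u\n", XDR_QUADLEN(10));	/* 10 bytes occupy 3 words */
	return 0;
}
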
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
242 251
243static __be32 252static __be32
244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, 253nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
245 struct iattr *iattr, struct nfs4_acl **acl) 254 struct iattr *iattr, struct nfs4_acl **acl,
255 struct xdr_netobj *label)
246{ 256{
247 int expected_len, len = 0; 257 int expected_len, len = 0;
248 u32 dummy32; 258 u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
380 goto xdr_error; 390 goto xdr_error;
381 } 391 }
382 } 392 }
393
394 label->len = 0;
395#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
396 if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
397 READ_BUF(4);
398 len += 4;
399 READ32(dummy32); /* lfs: we don't use it */
400 READ_BUF(4);
401 len += 4;
402 READ32(dummy32); /* pi: we don't use it either */
403 READ_BUF(4);
404 len += 4;
405 READ32(dummy32);
406 READ_BUF(dummy32);
407 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
408 return nfserr_badlabel;
409 len += (XDR_QUADLEN(dummy32) << 2);
410 READMEM(buf, dummy32);
411 label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
412 if (!label->data)
413 return nfserr_jukebox;
414 defer_free(argp, kfree, label->data);
415 memcpy(label->data, buf, dummy32);
416 }
417#endif
418
383 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 419 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
384 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 420 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
385 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) 421 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
428 /* callback_sec_params4 */ 464 /* callback_sec_params4 */
429 READ_BUF(4); 465 READ_BUF(4);
430 READ32(nr_secflavs); 466 READ32(nr_secflavs);
431 cbs->flavor = (u32)(-1); 467 if (nr_secflavs)
468 cbs->flavor = (u32)(-1);
469 else
470 /* Is this legal? Be generous, take it to mean AUTH_NONE: */
471 cbs->flavor = 0;
432 for (i = 0; i < nr_secflavs; ++i) { 472 for (i = 0; i < nr_secflavs; ++i) {
433 READ_BUF(4); 473 READ_BUF(4);
434 READ32(dummy); 474 READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
576 return status; 616 return status;
577 617
578 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 618 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
579 &create->cr_acl); 619 &create->cr_acl, &create->cr_label);
580 if (status) 620 if (status)
581 goto out; 621 goto out;
582 622
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
827 case NFS4_CREATE_UNCHECKED: 867 case NFS4_CREATE_UNCHECKED:
828 case NFS4_CREATE_GUARDED: 868 case NFS4_CREATE_GUARDED:
829 status = nfsd4_decode_fattr(argp, open->op_bmval, 869 status = nfsd4_decode_fattr(argp, open->op_bmval,
830 &open->op_iattr, &open->op_acl); 870 &open->op_iattr, &open->op_acl, &open->op_label);
831 if (status) 871 if (status)
832 goto out; 872 goto out;
833 break; 873 break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		READ_BUF(NFS4_VERIFIER_SIZE);
 		COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
 		status = nfsd4_decode_fattr(argp, open->op_bmval,
-			&open->op_iattr, &open->op_acl);
+			&open->op_iattr, &open->op_acl, &open->op_label);
 		if (status)
 			goto out;
 		break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	if (status)
 		return status;
 	return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
-				  &setattr->sa_acl);
+				  &setattr->sa_acl, &setattr->sa_label);
 }
 
 static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
 static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
 	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
 	[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
+	[2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
 };
 
 static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
 	      FATTR4_WORD0_RDATTR_ERROR)
 #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{
+	__be32 *p = *pp;
+
+	if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+		return nfserr_resource;
+
+	/*
+	 * For now we use a 0 here to indicate the null translation; in
+	 * the future we may place a call to translation code here.
+	 */
+	if ((*buflen -= 8) < 0)
+		return nfserr_resource;
+
+	WRITE32(0); /* lfs */
+	WRITE32(0); /* pi */
+	p = xdr_encode_opaque(p, context, len);
+	*buflen -= (XDR_QUADLEN(len) << 2) + 4;
+
+	*pp = p;
+	return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{ return 0; }
+#endif
+
 static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 {
 	/* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	int err;
 	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
+	void *context = NULL;
+	int contextlen;
+	bool contextsupport = false;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
 	struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		}
 	}
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+			bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+		err = security_inode_getsecctx(dentry->d_inode,
+						&context, &contextlen);
+		contextsupport = (err == 0);
+		if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+			if (err == -EOPNOTSUPP)
+				bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+			else if (err)
+				goto out_nfserr;
+		}
+	}
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
 	if (bmval2) {
 		if ((buflen -= 16) < 0)
 			goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 
 	if (!aclsupport)
 		word0 &= ~FATTR4_WORD0_ACL;
+	if (!contextsupport)
+		word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
 	if (!word2) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
 		get_parent_attributes(exp, &stat);
 		WRITE64(stat.ino);
 	}
+	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+		status = nfsd4_encode_security_label(rqstp, context,
+				contextlen, &p, &buflen);
+		if (status)
+			goto out;
+	}
 	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
 		WRITE32(3);
 		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
 	status = nfs_ok;
 
 out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if (context)
+		security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
 	kfree(acl);
 	if (fhp == &tempfh)
 		fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	RESERVE_SPACE(16);
 	if (nfserr) {
-		WRITE32(2);
+		WRITE32(3);
+		WRITE32(0);
 		WRITE32(0);
 		WRITE32(0);
 	}
 	else {
-		WRITE32(2);
+		WRITE32(3);
 		WRITE32(setattr->sa_bmval[0]);
 		WRITE32(setattr->sa_bmval[1]);
+		WRITE32(setattr->sa_bmval[2]);
 	}
 	ADJUST_ARGS();
 	return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 	return nfserr;
 }
 
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+	[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+	      1 << (OP_EXCHANGE_ID - 32) |
+	      1 << (OP_CREATE_SESSION - 32) |
+	      1 << (OP_DESTROY_SESSION - 32) |
+	      1 << (OP_DESTROY_CLIENTID - 32)
+};
+
 static __be32
 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 		struct nfsd4_exchange_id *exid)
@@ -3249,7 +3360,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 		8 /* eir_clientid */ +
 		4 /* eir_sequenceid */ +
 		4 /* eir_flags */ +
-		4 /* spr_how (SP4_NONE) */ +
+		4 /* spr_how */ +
+		8 /* spo_must_enforce, spo_must_allow */ +
 		8 /* so_minor_id */ +
 		4 /* so_major_id.len */ +
 		(XDR_QUADLEN(major_id_sz) * 4) +
@@ -3261,9 +3373,21 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 	WRITE32(exid->seqid);
 	WRITE32(exid->flags);
 
-	/* state_protect4_r. Currently only support SP4_NONE */
-	BUG_ON(exid->spa_how != SP4_NONE);
 	WRITE32(exid->spa_how);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce bitmap: */
+		WRITE32(2);
+		WRITE32(nfs4_minimal_spo_must_enforce[0]);
+		WRITE32(nfs4_minimal_spo_must_enforce[1]);
+		/* empty spo_must_allow bitmap: */
+		WRITE32(0);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
 
 	/* The server_owner struct */
 	WRITE64(minor_id);	/* Minor id */
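The spo_must_enforce words encoded above follow bitmap4 conventions: word i carries operations 32*i through 32*i+31, which is why every shift in nfs4_minimal_spo_must_enforce is written as the op number minus 32. A standalone check of the resulting words, assuming the RFC 5661 operation numbers (41 through 44, and 57):

	#include <stdio.h>

	int main(void)
	{
		/* RFC 5661 op numbers used by the patch (assumed here):
		 * BIND_CONN_TO_SESSION=41, EXCHANGE_ID=42, CREATE_SESSION=43,
		 * DESTROY_SESSION=44, DESTROY_CLIENTID=57. */
		unsigned ops[] = { 41, 42, 43, 44, 57 };
		unsigned i, words[2] = { 0, 0 };

		for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
			words[ops[i] / 32] |= 1u << (ops[i] % 32);

		/* word 0 stays 0; word 1 gets bits 9-12 and 25: 0x2001e00 */
		printf("word0=%#x word1=%#x\n", words[0], words[1]);
		return 0;
	}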
@@ -3635,13 +3759,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
 	if (nfsd4_has_session(cs)) {
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+		struct nfs4_client *clp = cs->session->se_client;
 		if (cs->status != nfserr_replay_cache) {
 			nfsd4_store_cache_entry(resp);
 			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
 		}
 		/* Renew the clientid on success and on replay */
-		put_client_renew(cs->session->se_client);
+		spin_lock(&nn->client_lock);
 		nfsd4_put_session(cs->session);
+		spin_unlock(&nn->client_lock);
+		put_client_renew(clp);
 	}
 	return 1;
 }
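The reordering in that last hunk is subtle: the client pointer is saved before nfsd4_put_session() can drop the final session reference, and the put itself now happens under the per-net client_lock. A schematic of the resulting shape (not standalone code; nn, cs and the helpers are the kernel's own):

	/* Sketch only: why clp is cached up front. */
	struct nfs4_client *clp = cs->session->se_client;

	spin_lock(&nn->client_lock);
	nfsd4_put_session(cs->session);	/* may free the session */
	spin_unlock(&nn->client_lock);
	put_client_renew(clp);		/* safe: clp was cached above */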
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
 /*
  * nfsd version
  */
-#define NFSD_SUPPORTED_MINOR_VERSION	1
+#define NFSD_SUPPORTED_MINOR_VERSION	2
 /*
  * Maximum blocksizes supported by daemon under various circumstances.
  */
@@ -53,7 +53,6 @@ struct readdir_cd {
 extern struct svc_program	nfsd_program;
 extern struct svc_version	nfsd_version2, nfsd_version3,
 				nfsd_version4;
-extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
 extern unsigned long		nfsd_drc_max_mem;
@@ -243,6 +242,12 @@ void nfsd_lockd_shutdown(void);
 #define	nfserr_reject_deleg		cpu_to_be32(NFS4ERR_REJECT_DELEG)
 #define	nfserr_returnconflict		cpu_to_be32(NFS4ERR_RETURNCONFLICT)
 #define	nfserr_deleg_revoked		cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define	nfserr_partner_notsupp		cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define	nfserr_partner_no_auth		cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define	nfserr_metadata_notsupp		cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define	nfserr_offload_denied		cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define	nfserr_wrong_lfs		cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define	nfserr_badlabel			cpu_to_be32(NFS4ERR_BADLABEL)
 
 /* error codes for internal use */
 /* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +327,13 @@ void nfsd_lockd_shutdown(void);
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+	(NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#else
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#endif
+
 static inline u32 nfsd_suppattrs0(u32 minorversion)
 {
 	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +348,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
 
 static inline u32 nfsd_suppattrs2(u32 minorversion)
 {
-	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
-			    : NFSD4_SUPPORTED_ATTRS_WORD2;
+	switch (minorversion) {
+	default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+	case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+	case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
+	}
 }
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +365,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
 #define NFSD_WRITEABLE_ATTRS_WORD1 \
 	(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
 	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
 #define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
 
 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
 	NFSD_WRITEABLE_ATTRS_WORD0
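Taken together, the nfsd.h hunks make both the supported and the writeable attribute word 2 a function of minor version and config, and the default case means 4.2 semantics also apply to any future minor version. A toy model of that dispatch, with placeholder values standing in for the real FATTR4 bits:

	#include <stdio.h>

	/* Stand-in values, not the kernel's: */
	#define ATTRS2_40 0x0
	#define ATTRS2_41 0x1	/* + SUPPATTR_EXCLCREAT */
	#define ATTRS2_42 0x3	/* + SECURITY_LABEL when configured */

	static unsigned suppattrs2(unsigned minorversion)
	{
		switch (minorversion) {
		default: return ATTRS2_42;	/* 4.2 and anything newer */
		case 1:  return ATTRS2_41;
		case 0:  return ATTRS2_40;
		}
	}

	int main(void)
	{
		unsigned mv;

		for (mv = 0; mv <= 3; mv++)
			printf("v4.%u -> %#x\n", mv, suppattrs2(mv));
		return 0;
	}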
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
 
 };
 
-u32 nfsd_supported_minorversion;
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+	[0] = 1,
+	[1] = 1,
+};
 
 int nfsd_vers(int vers, enum vers_op change)
 {
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
 		return -1;
 	switch(change) {
 	case NFSD_SET:
-		nfsd_supported_minorversion = minorversion;
+		nfsd_supported_minorversions[minorversion] = true;
 		break;
 	case NFSD_CLEAR:
-		if (minorversion == 0)
-			return -1;
-		nfsd_supported_minorversion = minorversion - 1;
+		nfsd_supported_minorversions[minorversion] = false;
 		break;
 	case NFSD_TEST:
-		return minorversion <= nfsd_supported_minorversion;
+		return nfsd_supported_minorversions[minorversion];
 	case NFSD_AVAIL:
 		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
 	}
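The switch from a single "highest supported" integer to a bool array changes the semantics: minor versions can now be toggled independently, and clearing 4.1 no longer implies anything about 4.0 or 4.2. A small standalone demonstration of the new behaviour:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAXMINOR 2
	static bool supported[MAXMINOR + 1] = { true, true, false };

	int main(void)
	{
		int v;

		supported[1] = false;	/* the NFSD_CLEAR case, on 4.1 only */
		supported[2] = true;	/* the NFSD_SET case, on 4.2 only */
		for (v = 0; v <= MAXMINOR; v++)
			printf("4.%d: %s\n", v, supported[v] ? "on" : "off");
		return 0;
	}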
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
 	nfs4_verifier		cl_verifier;	/* generated by client */
 	time_t			cl_time;	/* time of last lease renewal */
 	struct sockaddr_storage	cl_addr;	/* client ipaddress */
+	bool			cl_mach_cred;	/* SP4_MACH_CRED in force */
 	struct svc_cred		cl_cred;	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84ce601d8063..c827acb0e943 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
 #include <asm/uaccess.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
+#include <linux/security.h>
 
 #ifdef CONFIG_NFSD_V3
 #include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	return 1;
 }
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+
+	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+
+	mutex_lock(&dentry->d_inode->i_mutex);
+	host_error = security_inode_setsecctx(dentry, label->data, label->len);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	return nfserr_notsupp;
+}
+#endif
+
 #endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
@@ -802,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		flags = O_WRONLY|O_LARGEFILE;
 	}
 	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp))
+	if (IS_ERR(*filp)) {
 		host_err = PTR_ERR(*filp);
-	else {
+		*filp = NULL;
+	} else {
 		host_err = ima_file_check(*filp, may_flags);
 
 		if (may_flags & NFSD_MAY_64BIT_COOKIE)
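The nfsd_open() hunk fixes an error-path contract: on failure, *filp used to be left holding the ERR_PTR cookie returned by dentry_open(), which later cleanup code could mistake for a valid file pointer. A simplified userspace illustration of the rule the fix establishes (IS_ERR/PTR_ERR are mocked here; the names are mine, not nfsd's):

	#include <stdio.h>

	#define MAX_ERRNO 4095
	#define IS_ERR(p)  ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
	#define PTR_ERR(p) ((long)(p))

	/* An out-parameter should be a valid pointer on success or NULL on
	 * error, never an error cookie. */
	static int open_thing(void **out)
	{
		void *p = (void *)(unsigned long)-13;	/* pretend ERR_PTR */

		if (IS_ERR(p)) {
			*out = NULL;	/* the fix: don't leak the cookie */
			return (int)PTR_ERR(p);
		}
		*out = p;
		return 0;
	}

	int main(void)
	{
		void *f;
		int err = open_thing(&f);

		printf("err=%d out=%p\n", err, f);	/* out is NULL, safe */
		return 0;
	}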
@@ -1912,6 +1941,7 @@ struct buffered_dirent {
 };
 
 struct readdir_data {
+	struct dir_context ctx;
 	char		*dirent;
 	size_t		used;
 	int		full;
@@ -1943,13 +1973,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 				    struct readdir_cd *cdp, loff_t *offsetp)
 {
-	struct readdir_data buf;
 	struct buffered_dirent *de;
 	int host_err;
 	int size;
 	loff_t offset;
+	struct readdir_data buf = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
 
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
 		return nfserrno(-ENOMEM);
 
@@ -1963,7 +1995,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 		buf.used = 0;
 		buf.full = 0;
 
-		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		host_err = iterate_dir(file, &buf.ctx);
 		if (buf.full)
 			host_err = 0;
 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
 typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
 
 /* nfsd/vfs.c */
-int		fh_lock_parent(struct svc_fh *, struct dentry *);
 int		nfsd_racache_init(int);
 void		nfsd_racache_shutdown(void);
 int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
 __be32		nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
 		    struct nfs4_acl *);
 int		nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+__be32		nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+		    struct xdr_netobj *);
 #endif /* CONFIG_NFSD_V4 */
 __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
 				struct svc_fh *, char *, int);
 __be32		nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
 				char *name, int len);
-int		nfsd_truncate(struct svc_rqst *, struct svc_fh *,
-				unsigned long size);
 __be32		nfsd_readdir(struct svc_rqst *, struct svc_fh *,
 			     loff_t *, struct readdir_cd *, filldir_t);
 __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 				struct kstatfs *, int access);
 
-int		nfsd_notify_change(struct inode *, struct iattr *);
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
-int		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
 #include "state.h"
 #include "nfsd.h"
 
+#define NFSD4_MAX_SEC_LABEL_LEN	2048
 #define NFSD4_MAX_TAGLEN	128
 #define XDR_LEN(n)		(((n) + 3) & ~3)
 
@@ -118,6 +119,7 @@ struct nfsd4_create {
 	struct iattr	cr_iattr;           /* request */
 	struct nfsd4_change_info  cr_cinfo; /* response */
 	struct nfs4_acl *cr_acl;
+	struct xdr_netobj cr_label;
 };
 #define cr_linklen	u.link.namelen
 #define cr_linkname	u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
 	struct nfs4_file *op_file;          /* used during processing */
 	struct nfs4_ol_stateid *op_stp;	    /* used during processing */
 	struct nfs4_acl *op_acl;
+	struct xdr_netobj op_label;
 };
 #define op_iattr	iattr
 
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
 	u32		sa_bmval[3];        /* request */
 	struct iattr	sa_iattr;           /* request */
 	struct nfs4_acl *sa_acl;
+	struct xdr_netobj sa_label;
 };
 
 struct nfsd4_setclientid {
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 }
 
 /**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	unsigned long blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
+/**
  * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
  * @inode: inode of metadata file using this allocator
  * @req: nilfs_palloc_req structure exchanged for the allocation
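The arithmetic in nilfs_palloc_count_max_entries() is easiest to see with concrete numbers. A standalone example with made-up geometry (the real values come from the on-disk layout, so these are assumptions for illustration only):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t entries_per_group = 16384;	/* assumed */
		uint64_t groups_per_desc   = 32;	/* assumed */
		uint64_t desc_blocks       = 4;
		uint64_t nused             = 2097152;	/* exactly nmax */

		uint64_t per_desc = entries_per_group * groups_per_desc;
		uint64_t nmax = per_desc * desc_blocks;

		/* If every entry is used and the MDT file can still grow,
		 * count one more descriptor block's worth of entries. */
		if (nused == nmax /* && file can grow */)
			nmax += per_desc;

		printf("per_desc=%llu nmax=%llu\n",
		       (unsigned long long)per_desc,
		       (unsigned long long)nmax);
		return 0;
	}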
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
 /**
  * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017740a7..197a63e9d102 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
 	de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
-	unsigned char *types = NULL;
-	int ret;
 
 	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
-		goto success;
-
-	types = nilfs_filetype_table;
+		return 0;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			nilfs_error(sb, __func__, "bad page in #%lu",
 				    inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
 		}
 		kaddr = page_address(page);
 		de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (de->rec_len == 0) {
 				nilfs_error(sb, __func__,
 					    "zero-length directory entry");
-				ret = -EIO;
 				nilfs_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < NILFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < NILFS_FT_MAX)
+					t = nilfs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le64_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode), t)) {
 					nilfs_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
-			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+			ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	return ret;
+	return 0;
 }
 
 /*
@@ -678,7 +666,7 @@ not_empty:
 const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= nilfs_readdir,
+	.iterate	= nilfs_readdir,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= nilfs_compat_ioctl,
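This conversion is part of the tree-wide switch from ->readdir/filldir to ->iterate/dir_emit: the actor no longer hand-computes cookie offsets, position lives in ctx->pos, and a full caller buffer is signalled by dir_emit() returning false. A schematic actor under that contract; first_entry(), next_entry() and next_pos() are hypothetical helpers, not kernel API:

	/* Sketch only, not compilable standalone. */
	static int example_readdir(struct file *file, struct dir_context *ctx)
	{
		struct example_entry *de;

		for (de = first_entry(file, ctx->pos); de; de = next_entry(de)) {
			if (!dir_emit(ctx, de->name, de->name_len,
				      de->ino, de->d_type))
				return 0;	/* caller's buffer is full */
			ctx->pos = next_pos(de);	/* resume point */
		}
		return 0;	/* end of directory */
	}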
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				   u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
+/**
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 		     size_t inode_size, struct nilfs_inode *raw_inode,
 		     struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 
 	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_add(n, &root->blocks_count);
+		atomic64_add(n, &root->blocks_count);
 }
 
 void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 
 	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_sub(n, &root->blocks_count);
+		atomic64_sub(n, &root->blocks_count);
 }
 
 /**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&root->inodes_count);
+	atomic64_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
 
 	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
 	if (!ret)
-		atomic_dec(&ii->i_root->inodes_count);
+		atomic64_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 		raw_cp->cp_snapshot_list.ssl_next = 0;
 		raw_cp->cp_snapshot_list.ssl_prev = 0;
 		raw_cp->cp_inodes_count =
-			cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+			cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
 		raw_cp->cp_blocks_count =
-			cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+			cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
 		raw_cp->cp_nblk_inc =
 			cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 		raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 	if (err)
 		goto failed_bh;
 
-	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+	atomic64_set(&root->inodes_count,
+			le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+			le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned long overhead;
 	unsigned long nrsvblocks;
 	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
 	int err;
 
 	/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (unlikely(err))
 		return err;
 
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					    &nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic64_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
 	buf->f_type = NILFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = blocks - overhead;
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&root->inodes_count);
-	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-	return root_dentry->d_count > 1;
+	return d_count(root_dentry) > 1;
 }
 
 /**
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 	new->ifile = NULL;
 	new->nilfs = nilfs;
 	atomic_set(&new->count, 1);
-	atomic_set(&new->inodes_count, 0);
-	atomic_set(&new->blocks_count, 0);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
 	struct the_nilfs *nilfs;
 	struct inode *ifile;
 
-	atomic_t inodes_count;
-	atomic_t blocks_count;
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
 };
 
 /* Special checkpoint number */
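The atomic_t to atomic64_t conversion across the nilfs2 hunks above matters because a 32-bit counter silently wraps once a volume tracks more than 2^32 inodes or blocks. A standalone demonstration of the truncation the old type risked, using plain integers:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t real = 5000000000ULL;	/* > 2^32 blocks */
		uint32_t as32 = (uint32_t)real;	/* what a 32-bit counter keeps */

		printf("64-bit: %llu\n", (unsigned long long)real);
		printf("32-bit: %u (wrapped)\n", as32);
		return 0;
	}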
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
 static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
  * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	spin_lock(&fsn_mark->lock);
 	prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	spin_unlock(&fsn_mark->lock);
 
-	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	/* nothing else could have found us thanks to the dnotify_groups
+	   mark_mutex */
 	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 }
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
 	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+					 NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
-	 * only time we clean up the marks we need to get our mark off
-	 * the list. */
+	 * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
+	 * fd is the only time we clean up the marks we need to get our mark
+	 * off the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
 		 * the flush actually did shoot this fsn_mark.  That's fine too
@@ -385,9 +386,9 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6c80083a984f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->reserved = 0;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
 	if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -399,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
-	if (file->f_flags & FASYNC)
-		fsnotify_fasync(-1, file, 0);
-
 	/* matches the fanotify_init->fsnotify_alloc_group */
 	fsnotify_destroy_group(group);
 
@@ -526,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -550,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 	if (removed & inode->i_fsnotify_mask)
@@ -593,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	return mask & ~oldmask;
 }
 
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+						   struct inode *inode,
+						   struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	int ret;
+
+	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		return ERR_PTR(-ENOSPC);
+
+	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!mark)
+		return ERR_PTR(-ENOMEM);
+
+	fsnotify_init_mark(mark, fanotify_free_mark);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	if (ret) {
+		fsnotify_put_mark(mark);
+		return ERR_PTR(ret);
+	}
+
+	return mark;
+}
+
+
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
 				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -630,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -644,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	    (atomic_read(&inode->i_writecount) > 0))
 		return 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 /* fanotify syscalls */
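All three fanotify call sites above converge on one pattern: hold group->mark_mutex across the find and the factored-out create, so the check and the insertion are atomic and the old ad-hoc ENOSPC/ENOMEM unwinding collapses into fanotify_add_new_mark(). Schematically, with invented helper names (a sketch, not the kernel's code):

	/* Find-or-create under the group mutex; the lock makes the pair
	 * atomic with respect to concurrent adds and removes. */
	mutex_lock(&group->mark_mutex);
	mark = find_mark(group, object);		/* hypothetical */
	if (!mark) {
		mark = alloc_and_add_mark_locked(group, object);
		if (IS_ERR(mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(mark);
		}
	}
	update_mask(mark, mask, flags);			/* hypothetical */
	mutex_unlock(&group->mark_mutex);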
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+				       NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	int ret = 0;
 
-retry:
+	mutex_lock(&group->mark_mutex);
 	/* try to update and existing watch with the new arg */
 	ret = inotify_update_existing_watch(group, inode, arg);
 	/* no mark present, try to add a new one */
 	if (ret == -ENOENT)
 		ret = inotify_new_watch(group, inode, arg);
-	/*
-	 * inotify_new_watch could race with another thread which did an
-	 * inotify_new_watch between the update_existing and the add watch
-	 * here, go back and try to update an existing mark again.
-	 */
-	if (ret == -EEXIST)
-		goto retry;
+	mutex_unlock(&group->mark_mutex);
 
 	return ret;
 }
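With group->mark_mutex held across both calls, inotify_new_watch() can no longer lose a race to a concurrent add between the "update existing" check and the insertion, so the -EEXIST retry loop becomes dead code and is removed. The serialized shape, schematically (pseudo-helpers, illustration only):

	/* One critical section replaces "try, maybe add, retry on EEXIST". */
	mutex_lock(&group->mark_mutex);
	ret = update_existing(group, inode, arg);	/* hypothetical */
	if (ret == -ENOENT)
		ret = add_new(group, inode, arg);	/* cannot race now */
	mutex_unlock(&group->mark_mutex);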
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
  * fsnotify inode mark locking/lifetime/and refcnting
  *
  * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object. The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
  *
  * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
  *
+ * group->mark_mutex
  * mark->lock
- * group->mark_lock
  * inode->i_lock
  *
- * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list. It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
  *
  * inode->i_lock protects the i_fsnotify_marks list anchored inside a
  * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each 65 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us). 66 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a 67 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no 68 * private list anchored on the stack using i_free_list; we walk i_free_list
68 * longer fear anything finding the mark using the inode's list of marks. 69 * and before we destroy the mark we make sure that we don't race with a
69 * 70 * concurrent destroy_group by getting a ref to the mark's group and taking the
70 * We can safely and locklessly run the private list on the stack of everything 71 * group's mutex.
71 * we just unattached from the original inode. For each mark on the private list 72
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list. 73 * Very similarly for freeing by group, except we use free_g_list.
80 * 74 *
81 * This has the very interesting property of being able to run concurrently with 75 * This has the very interesting property of being able to run concurrently with
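
The ordering rule documented above can be made concrete with a short, hypothetical sketch (not part of this patch; field names follow the fsnotify structures of this era): any path that detaches a mark takes the locks outermost-first.

	/* Hypothetical sketch: detach a mark while honouring the documented
	 * order group->mark_mutex -> mark->lock -> inode->i_lock. */
	mutex_lock(&group->mark_mutex);		/* guards group->marks_list  */
	spin_lock(&mark->lock);			/* guards mark attributes    */
	spin_lock(&inode->i_lock);		/* guards i_fsnotify_marks   */
	hlist_del_init_rcu(&mark->i.i_list);	/* off the inode's list      */
	spin_unlock(&inode->i_lock);
	list_del_init(&mark->g_list);		/* off the group's list      */
	spin_unlock(&mark->lock);
	mutex_unlock(&group->mark_mutex);
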
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
1372 * The page may have dirty, unmapped buffers. Make them 1372 * The page may have dirty, unmapped buffers. Make them
1373 * freeable here, so the page does not leak. 1373 * freeable here, so the page does not leak.
1374 */ 1374 */
1375 block_invalidatepage(page, 0); 1375 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1376 unlock_page(page); 1376 unlock_page(page);
1377 ntfs_debug("Write outside i_size - truncated?"); 1377 ntfs_debug("Write outside i_size - truncated?");
1378 return 0; 1378 return 0;
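
The hunk above tracks a VFS interface change: ->invalidatepage now receives a length as well as an offset, so callers can invalidate part of a page; a full-page caller like this one simply passes the page size. A hedged sketch of a minimal implementation under the new signature (myfs_invalidatepage is a made-up name):

	static void myfs_invalidatepage(struct page *page, unsigned int offset,
					unsigned int length)
	{
		/* Forward the partial-page range to the buffer-head helper,
		 * as the ntfs and ocfs2 hunks in this diff do. */
		block_invalidatepage(page, offset, length);
	}
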
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3f20e9..9e38dafa3bc7 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
1004/** 1004/**
1005 * ntfs_filldir - ntfs specific filldir method 1005 * ntfs_filldir - ntfs specific filldir method
1006 * @vol: current ntfs volume 1006 * @vol: current ntfs volume
1007 * @fpos: position in the directory
1008 * @ndir: ntfs inode of current directory 1007 * @ndir: ntfs inode of current directory
1009 * @ia_page: page in which the index allocation buffer @ie resides 1008 * @ia_page: page in which the index allocation buffer @ie resides
1010 * @ie: current index entry 1009 * @ie: current index entry
1011 * @name: buffer to use for the converted name 1010 * @name: buffer to use for the converted name
1012 * @dirent: vfs filldir callback context 1011 * @actor: what to feed the entries to
1013 * @filldir: vfs filldir callback
1014 * 1012 *
1015 * Convert the Unicode @name to the loaded NLS and pass it to the @filldir 1013 * Convert the Unicode @name to the loaded NLS and pass it to the @actor
1016 * callback. 1014 * callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
1024 * retake the lock if we are returning a non-zero value as ntfs_readdir() 1022 * retake the lock if we are returning a non-zero value as ntfs_readdir()
1025 * would need to drop the lock immediately anyway. 1023 * would need to drop the lock immediately anyway.
1026 */ 1024 */
1027static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, 1025static inline int ntfs_filldir(ntfs_volume *vol,
1028 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, 1026 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
1029 u8 *name, void *dirent, filldir_t filldir) 1027 u8 *name, struct dir_context *actor)
1030{ 1028{
1031 unsigned long mref; 1029 unsigned long mref;
1032 int name_len, rc; 1030 int name_len;
1033 unsigned dt_type; 1031 unsigned dt_type;
1034 FILE_NAME_TYPE_FLAGS name_type; 1032 FILE_NAME_TYPE_FLAGS name_type;
1035 1033
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1068 if (ia_page) 1066 if (ia_page)
1069 unlock_page(ia_page); 1067 unlock_page(ia_page);
1070 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " 1068 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
1071 "0x%lx, DT_%s.", name, name_len, fpos, mref, 1069 "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
1072 dt_type == DT_DIR ? "DIR" : "REG"); 1070 dt_type == DT_DIR ? "DIR" : "REG");
1073 rc = filldir(dirent, name, name_len, fpos, mref, dt_type); 1071 if (!dir_emit(actor, name, name_len, mref, dt_type))
1072 return 1;
1074 /* Relock the page but not if we are aborting ->readdir. */ 1073 /* Relock the page but not if we are aborting ->readdir. */
1075 if (!rc && ia_page) 1074 if (ia_page)
1076 lock_page(ia_page); 1075 lock_page(ia_page);
1077 return rc; 1076 return 0;
1078} 1077}
1079 1078
1080/* 1079/*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1097 * removes them again after the write is complete after which it 1096 * removes them again after the write is complete after which it
1098 * unlocks the page. 1097 * unlocks the page.
1099 */ 1098 */
1100static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 1099static int ntfs_readdir(struct file *file, struct dir_context *actor)
1101{ 1100{
1102 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; 1101 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
1103 loff_t fpos, i_size; 1102 loff_t i_size;
1104 struct inode *bmp_vi, *vdir = file_inode(filp); 1103 struct inode *bmp_vi, *vdir = file_inode(file);
1105 struct super_block *sb = vdir->i_sb; 1104 struct super_block *sb = vdir->i_sb;
1106 ntfs_inode *ndir = NTFS_I(vdir); 1105 ntfs_inode *ndir = NTFS_I(vdir);
1107 ntfs_volume *vol = NTFS_SB(sb); 1106 ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1116 u8 *kaddr, *bmp, *index_end; 1115 u8 *kaddr, *bmp, *index_end;
1117 ntfs_attr_search_ctx *ctx; 1116 ntfs_attr_search_ctx *ctx;
1118 1117
1119 fpos = filp->f_pos;
1120 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", 1118 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
1121 vdir->i_ino, fpos); 1119 vdir->i_ino, actor->pos);
1122 rc = err = 0; 1120 rc = err = 0;
1123 /* Are we at end of dir yet? */ 1121 /* Are we at end of dir yet? */
1124 i_size = i_size_read(vdir); 1122 i_size = i_size_read(vdir);
1125 if (fpos >= i_size + vol->mft_record_size) 1123 if (actor->pos >= i_size + vol->mft_record_size)
1126 goto done; 1124 return 0;
1127 /* Emulate . and .. for all directories. */ 1125 /* Emulate . and .. for all directories. */
1128 if (!fpos) { 1126 if (!dir_emit_dots(file, actor))
1129 ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " 1127 return 0;
1130 "inode 0x%lx, DT_DIR.", vdir->i_ino);
1131 rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
1132 if (rc)
1133 goto done;
1134 fpos++;
1135 }
1136 if (fpos == 1) {
1137 ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
1138 "inode 0x%lx, DT_DIR.",
1139 (unsigned long)parent_ino(filp->f_path.dentry));
1140 rc = filldir(dirent, "..", 2, fpos,
1141 parent_ino(filp->f_path.dentry), DT_DIR);
1142 if (rc)
1143 goto done;
1144 fpos++;
1145 }
1146 m = NULL; 1128 m = NULL;
1147 ctx = NULL; 1129 ctx = NULL;
1148 /* 1130 /*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1155 goto err_out; 1137 goto err_out;
1156 } 1138 }
1157 /* Are we jumping straight into the index allocation attribute? */ 1139 /* Are we jumping straight into the index allocation attribute? */
1158 if (fpos >= vol->mft_record_size) 1140 if (actor->pos >= vol->mft_record_size)
1159 goto skip_index_root; 1141 goto skip_index_root;
1160 /* Get hold of the mft record for the directory. */ 1142 /* Get hold of the mft record for the directory. */
1161 m = map_mft_record(ndir); 1143 m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1170 goto err_out; 1152 goto err_out;
1171 } 1153 }
1172 /* Get the offset into the index root attribute. */ 1154 /* Get the offset into the index root attribute. */
1173 ir_pos = (s64)fpos; 1155 ir_pos = (s64)actor->pos;
1174 /* Find the index root attribute in the mft record. */ 1156 /* Find the index root attribute in the mft record. */
1175 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 1157 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
1176 0, ctx); 1158 0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1226 if (ir_pos > (u8*)ie - (u8*)ir) 1208 if (ir_pos > (u8*)ie - (u8*)ir)
1227 continue; 1209 continue;
1228 /* Advance the position even if going to skip the entry. */ 1210 /* Advance the position even if going to skip the entry. */
1229 fpos = (u8*)ie - (u8*)ir; 1211 actor->pos = (u8*)ie - (u8*)ir;
1230 /* Submit the name to the filldir callback. */ 1212 /* Submit the name to the filldir callback. */
1231 rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, 1213 rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
1232 filldir);
1233 if (rc) { 1214 if (rc) {
1234 kfree(ir); 1215 kfree(ir);
1235 goto abort; 1216 goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1242 if (!NInoIndexAllocPresent(ndir)) 1223 if (!NInoIndexAllocPresent(ndir))
1243 goto EOD; 1224 goto EOD;
1244 /* Advance fpos to the beginning of the index allocation. */ 1225 /* Advance fpos to the beginning of the index allocation. */
1245 fpos = vol->mft_record_size; 1226 actor->pos = vol->mft_record_size;
1246skip_index_root: 1227skip_index_root:
1247 kaddr = NULL; 1228 kaddr = NULL;
1248 prev_ia_pos = -1LL; 1229 prev_ia_pos = -1LL;
1249 /* Get the offset into the index allocation attribute. */ 1230 /* Get the offset into the index allocation attribute. */
1250 ia_pos = (s64)fpos - vol->mft_record_size; 1231 ia_pos = (s64)actor->pos - vol->mft_record_size;
1251 ia_mapping = vdir->i_mapping; 1232 ia_mapping = vdir->i_mapping;
1252 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); 1233 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
1253 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); 1234 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
1409 if (ia_pos - ia_start > (u8*)ie - (u8*)ia) 1390 if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
1410 continue; 1391 continue;
1411 /* Advance the position even if going to skip the entry. */ 1392 /* Advance the position even if going to skip the entry. */
1412 fpos = (u8*)ie - (u8*)ia + 1393 actor->pos = (u8*)ie - (u8*)ia +
1413 (sle64_to_cpu(ia->index_block_vcn) << 1394 (sle64_to_cpu(ia->index_block_vcn) <<
1414 ndir->itype.index.vcn_size_bits) + 1395 ndir->itype.index.vcn_size_bits) +
1415 vol->mft_record_size; 1396 vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
1419 * before returning, unless a non-zero value is returned in 1400 * before returning, unless a non-zero value is returned in
1420 * which case the page is left unlocked. 1401 * which case the page is left unlocked.
1421 */ 1402 */
1422 rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, 1403 rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
1423 filldir);
1424 if (rc) { 1404 if (rc) {
1425 /* @ia_page is already unlocked in this case. */ 1405 /* @ia_page is already unlocked in this case. */
1426 ntfs_unmap_page(ia_page); 1406 ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
1439 iput(bmp_vi); 1419 iput(bmp_vi);
1440EOD: 1420EOD:
1441 /* We are finished, set fpos to EOD. */ 1421 /* We are finished, set fpos to EOD. */
1442 fpos = i_size + vol->mft_record_size; 1422 actor->pos = i_size + vol->mft_record_size;
1443abort: 1423abort:
1444 kfree(name); 1424 kfree(name);
1445done:
1446#ifdef DEBUG
1447 if (!rc)
1448 ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
1449 else
1450 ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
1451 rc, fpos);
1452#endif
1453 filp->f_pos = fpos;
1454 return 0; 1425 return 0;
1455err_out: 1426err_out:
1456 if (bmp_page) { 1427 if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
1471 if (!err) 1442 if (!err)
1472 err = -EIO; 1443 err = -EIO;
1473 ntfs_debug("Failed. Returning error code %i.", -err); 1444 ntfs_debug("Failed. Returning error code %i.", -err);
1474 filp->f_pos = fpos;
1475 return err; 1445 return err;
1476} 1446}
1477 1447
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
1571const struct file_operations ntfs_dir_ops = { 1541const struct file_operations ntfs_dir_ops = {
1572 .llseek = generic_file_llseek, /* Seek inside directory. */ 1542 .llseek = generic_file_llseek, /* Seek inside directory. */
1573 .read = generic_read_dir, /* Return -EISDIR. */ 1543 .read = generic_read_dir, /* Return -EISDIR. */
1574 .readdir = ntfs_readdir, /* Read directory contents. */ 1544 .iterate = ntfs_readdir, /* Read directory contents. */
1575#ifdef NTFS_RW 1545#ifdef NTFS_RW
1576 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ 1546 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
1577 /*.aio_fsync = ,*/ /* Sync all outstanding async 1547 /*.aio_fsync = ,*/ /* Sync all outstanding async
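
The conversion above is the generic readdir-to-iterate pattern: the directory position lives in actor->pos, "." and ".." are emitted by dir_emit_dots(), and dir_emit() returns false once the user buffer is full, at which point the method simply returns and the VFS preserves the position. A hedged sketch of the skeleton; struct myfs_dirent and my_next_entry() are assumed helpers, not part of this patch:

	static int myfs_readdir(struct file *file, struct dir_context *actor)
	{
		struct inode *dir = file_inode(file);
		struct myfs_dirent *de;

		if (!dir_emit_dots(file, actor))	/* emits "." and ".." */
			return 0;
		/* my_next_entry(): assumed to return the entry at or after
		 * actor->pos, or NULL at end of directory. */
		while ((de = my_next_entry(dir, actor->pos))) {
			if (!dir_emit(actor, de->name, de->name_len,
				      de->ino, de->type))
				return 0;	/* buffer full: stop here */
			actor->pos = de->next_pos;
		}
		return 0;
	}
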
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5655 &ref_tree, NULL); 5655 &ref_tree, NULL);
5656 if (ret) { 5656 if (ret) {
5657 mlog_errno(ret); 5657 mlog_errno(ret);
5658 goto out; 5658 goto bail;
5659 } 5659 }
5660 5660
5661 ret = ocfs2_prepare_refcount_change_for_del(inode, 5661 ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5666 &extra_blocks); 5666 &extra_blocks);
5667 if (ret < 0) { 5667 if (ret < 0) {
5668 mlog_errno(ret); 5668 mlog_errno(ret);
5669 goto out; 5669 goto bail;
5670 } 5670 }
5671 } 5671 }
5672 5672
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5674 extra_blocks); 5674 extra_blocks);
5675 if (ret) { 5675 if (ret) {
5676 mlog_errno(ret); 5676 mlog_errno(ret);
5677 return ret; 5677 goto bail;
5678 } 5678 }
5679 5679
5680 mutex_lock(&tl_inode->i_mutex); 5680 mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
5734 ocfs2_commit_trans(osb, handle); 5734 ocfs2_commit_trans(osb, handle);
5735out: 5735out:
5736 mutex_unlock(&tl_inode->i_mutex); 5736 mutex_unlock(&tl_inode->i_mutex);
5737 5737bail:
5738 if (meta_ac) 5738 if (meta_ac)
5739 ocfs2_free_alloc_context(meta_ac); 5739 ocfs2_free_alloc_context(meta_ac);
5740 5740
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..2abf97b2a592 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
603 * from ext3. PageChecked() bits have been removed as OCFS2 does not 603 * from ext3. PageChecked() bits have been removed as OCFS2 does not
604 * do journalled data. 604 * do journalled data.
605 */ 605 */
606static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 606static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
607 unsigned int length)
607{ 608{
608 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 609 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
609 610
610 jbd2_journal_invalidatepage(journal, page, offset); 611 jbd2_journal_invalidatepage(journal, page, offset, length);
611} 612}
612 613
613static int ocfs2_releasepage(struct page *page, gfp_t wait) 614static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -1756,7 +1757,7 @@ try_again:
1756 goto out; 1757 goto out;
1757 } else if (ret == 1) { 1758 } else if (ret == 1) {
1758 clusters_need = wc->w_clen; 1759 clusters_need = wc->w_clen;
1759 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1760 ret = ocfs2_refcount_cow(inode, di_bh,
1760 wc->w_cpos, wc->w_clen, UINT_MAX); 1761 wc->w_cpos, wc->w_clen, UINT_MAX);
1761 if (ret) { 1762 if (ret) {
1762 mlog_errno(ret); 1763 mlog_errno(ret);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
176 } 176 }
177} 177}
178 178
179static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) 179static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
180{ 180{
181 int ret = -1; 181 int ret = -1;
182 182
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
500 } 500 }
501 501
502 atomic_inc(&write_wc->wc_num_reqs); 502 atomic_inc(&write_wc->wc_num_reqs);
503 submit_bio(WRITE, bio); 503 submit_bio(WRITE_SYNC, bio);
504 504
505 status = 0; 505 status = 0;
506bail: 506bail:
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) 2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2272 continue; 2272 continue;
2273 2273
2274 ret = o2hb_global_hearbeat_mode_set(i); 2274 ret = o2hb_global_heartbeat_mode_set(i);
2275 if (!ret) 2275 if (!ret)
2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", 2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2277 o2hb_heartbeat_mode_desc[i]); 2277 o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
2304 NULL, 2304 NULL,
2305}; 2305};
2306 2306
2307static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { 2307static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
2308 .show_attribute = o2hb_heartbeat_group_show, 2308 .show_attribute = o2hb_heartbeat_group_show,
2309 .store_attribute = o2hb_heartbeat_group_store, 2309 .store_attribute = o2hb_heartbeat_group_store,
2310}; 2310};
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
2316 2316
2317static struct config_item_type o2hb_heartbeat_group_type = { 2317static struct config_item_type o2hb_heartbeat_group_type = {
2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops, 2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
2319 .ct_item_ops = &o2hb_hearbeat_group_item_ops, 2319 .ct_item_ops = &o2hb_heartbeat_group_item_ops,
2320 .ct_attrs = o2hb_heartbeat_group_attrs, 2320 .ct_attrs = o2hb_heartbeat_group_attrs,
2321 .ct_owner = THIS_MODULE, 2321 .ct_owner = THIS_MODULE,
2322}; 2322};
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
2389 assert_spin_locked(&o2hb_live_lock); 2389 assert_spin_locked(&o2hb_live_lock);
2390 2390
2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2392 if (reg->hr_item_dropped)
2393 continue;
2394
2392 uuid = config_item_name(&reg->hr_item); 2395 uuid = config_item_name(&reg->hr_item);
2393 2396
2394 /* local heartbeat */ 2397 /* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
2439 assert_spin_locked(&o2hb_live_lock); 2442 assert_spin_locked(&o2hb_live_lock);
2440 2443
2441 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2444 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2445 if (reg->hr_item_dropped)
2446 continue;
2447
2442 uuid = config_item_name(&reg->hr_item); 2448 uuid = config_item_name(&reg->hr_item);
2443 if (region_uuid) { 2449 if (region_uuid) {
2444 if (strcmp(region_uuid, uuid)) 2450 if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2654 2660
2655 p = region_uuids; 2661 p = region_uuids;
2656 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2662 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2663 if (reg->hr_item_dropped)
2664 continue;
2665
2657 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item)); 2666 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2658 if (numregs < max_regions) { 2667 if (numregs < max_regions) {
2659 memcpy(p, config_item_name(&reg->hr_item), 2668 memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
264/* This is analogous to hb_up. as a node's connection comes up we delay the 264/* This is analogous to hb_up. as a node's connection comes up we delay the
265 * quorum decision until we see it heartbeating. the hold will be dropped in 265 * quorum decision until we see it heartbeating. the hold will be dropped in
266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
267 * it's already heartbeating we we might be dropping a hold that conn_up got. 267 * it's already heartbeating we might be dropping a hold that conn_up got.
268 * */ 268 * */
269void o2quo_conn_up(u8 node) 269void o2quo_conn_up(u8 node)
270{ 270{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
406 sc->sc_node = NULL; 406 sc->sc_node = NULL;
407 407
408 o2net_debug_del_sc(sc); 408 o2net_debug_del_sc(sc);
409
410 if (sc->sc_page)
411 __free_page(sc->sc_page);
409 kfree(sc); 412 kfree(sc);
410} 413}
411 414
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
630 state_change = sc->sc_state_change; 633 state_change = sc->sc_state_change;
631 634
632 switch(sk->sk_state) { 635 switch(sk->sk_state) {
633 /* ignore connecting sockets as they make progress */ 636 /* ignore connecting sockets as they make progress */
634 case TCP_SYN_SENT: 637 case TCP_SYN_SENT:
635 case TCP_SYN_RECV: 638 case TCP_SYN_RECV:
636 break; 639 break;
637 case TCP_ESTABLISHED: 640 case TCP_ESTABLISHED:
638 o2net_sc_queue_work(sc, &sc->sc_connect_work); 641 o2net_sc_queue_work(sc, &sc->sc_connect_work);
639 break; 642 break;
640 default: 643 default:
641 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT 644 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
642 " shutdown, state %d\n", 645 " shutdown, state %d\n",
643 SC_NODEF_ARGS(sc), sk->sk_state); 646 SC_NODEF_ARGS(sc), sk->sk_state);
644 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 647 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
645 break; 648 break;
646 } 649 }
647out: 650out:
648 read_unlock(&sk->sk_callback_lock); 651 read_unlock(&sk->sk_callback_lock);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8f638..30544ce8e9f7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1761,11 +1761,10 @@ bail:
1761 1761
1762static int ocfs2_dir_foreach_blk_id(struct inode *inode, 1762static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1763 u64 *f_version, 1763 u64 *f_version,
1764 loff_t *f_pos, void *priv, 1764 struct dir_context *ctx)
1765 filldir_t filldir, int *filldir_err)
1766{ 1765{
1767 int ret, i, filldir_ret; 1766 int ret, i;
1768 unsigned long offset = *f_pos; 1767 unsigned long offset = ctx->pos;
1769 struct buffer_head *di_bh = NULL; 1768 struct buffer_head *di_bh = NULL;
1770 struct ocfs2_dinode *di; 1769 struct ocfs2_dinode *di;
1771 struct ocfs2_inline_data *data; 1770 struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1781 di = (struct ocfs2_dinode *)di_bh->b_data; 1780 di = (struct ocfs2_dinode *)di_bh->b_data;
1782 data = &di->id2.i_data; 1781 data = &di->id2.i_data;
1783 1782
1784 while (*f_pos < i_size_read(inode)) { 1783 while (ctx->pos < i_size_read(inode)) {
1785revalidate:
1786 /* If the dir block has changed since the last call to 1784 /* If the dir block has changed since the last call to
1787 * readdir(2), then we might be pointing to an invalid 1785 * readdir(2), then we might be pointing to an invalid
1788 * dirent right now. Scan from the start of the block 1786 * dirent right now. Scan from the start of the block
@@ -1802,50 +1800,31 @@ revalidate:
1802 break; 1800 break;
1803 i += le16_to_cpu(de->rec_len); 1801 i += le16_to_cpu(de->rec_len);
1804 } 1802 }
1805 *f_pos = offset = i; 1803 ctx->pos = offset = i;
1806 *f_version = inode->i_version; 1804 *f_version = inode->i_version;
1807 } 1805 }
1808 1806
1809 de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); 1807 de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
1810 if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { 1808 if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
1811 /* On error, skip the f_pos to the end. */ 1809 /* On error, skip the f_pos to the end. */
1812 *f_pos = i_size_read(inode); 1810 ctx->pos = i_size_read(inode);
1813 goto out; 1811 break;
1814 } 1812 }
1815 offset += le16_to_cpu(de->rec_len); 1813 offset += le16_to_cpu(de->rec_len);
1816 if (le64_to_cpu(de->inode)) { 1814 if (le64_to_cpu(de->inode)) {
1817 /* We might block in the next section
1818 * if the data destination is
1819 * currently swapped out. So, use a
1820 * version stamp to detect whether or
1821 * not the directory has been modified
1822 * during the copy operation.
1823 */
1824 u64 version = *f_version;
1825 unsigned char d_type = DT_UNKNOWN; 1815 unsigned char d_type = DT_UNKNOWN;
1826 1816
1827 if (de->file_type < OCFS2_FT_MAX) 1817 if (de->file_type < OCFS2_FT_MAX)
1828 d_type = ocfs2_filetype_table[de->file_type]; 1818 d_type = ocfs2_filetype_table[de->file_type];
1829 1819
1830 filldir_ret = filldir(priv, de->name, 1820 if (!dir_emit(ctx, de->name, de->name_len,
1831 de->name_len, 1821 le64_to_cpu(de->inode), d_type))
1832 *f_pos, 1822 goto out;
1833 le64_to_cpu(de->inode),
1834 d_type);
1835 if (filldir_ret) {
1836 if (filldir_err)
1837 *filldir_err = filldir_ret;
1838 break;
1839 }
1840 if (version != *f_version)
1841 goto revalidate;
1842 } 1823 }
1843 *f_pos += le16_to_cpu(de->rec_len); 1824 ctx->pos += le16_to_cpu(de->rec_len);
1844 } 1825 }
1845
1846out: 1826out:
1847 brelse(di_bh); 1827 brelse(di_bh);
1848
1849 return 0; 1828 return 0;
1850} 1829}
1851 1830
@@ -1855,27 +1834,26 @@ out:
1855 */ 1834 */
1856static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1835static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1857 u64 *f_version, 1836 u64 *f_version,
1858 loff_t *f_pos, void *priv, 1837 struct dir_context *ctx,
1859 filldir_t filldir, int *filldir_err) 1838 bool persist)
1860{ 1839{
1861 int error = 0;
1862 unsigned long offset, blk, last_ra_blk = 0; 1840 unsigned long offset, blk, last_ra_blk = 0;
1863 int i, stored; 1841 int i;
1864 struct buffer_head * bh, * tmp; 1842 struct buffer_head * bh, * tmp;
1865 struct ocfs2_dir_entry * de; 1843 struct ocfs2_dir_entry * de;
1866 struct super_block * sb = inode->i_sb; 1844 struct super_block * sb = inode->i_sb;
1867 unsigned int ra_sectors = 16; 1845 unsigned int ra_sectors = 16;
1846 int stored = 0;
1868 1847
1869 stored = 0;
1870 bh = NULL; 1848 bh = NULL;
1871 1849
1872 offset = (*f_pos) & (sb->s_blocksize - 1); 1850 offset = ctx->pos & (sb->s_blocksize - 1);
1873 1851
1874 while (!error && !stored && *f_pos < i_size_read(inode)) { 1852 while (ctx->pos < i_size_read(inode)) {
1875 blk = (*f_pos) >> sb->s_blocksize_bits; 1853 blk = ctx->pos >> sb->s_blocksize_bits;
1876 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { 1854 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1877 /* Skip the corrupt dirblock and keep trying */ 1855 /* Skip the corrupt dirblock and keep trying */
1878 *f_pos += sb->s_blocksize - offset; 1856 ctx->pos += sb->s_blocksize - offset;
1879 continue; 1857 continue;
1880 } 1858 }
1881 1859
@@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1897 ra_sectors = 8; 1875 ra_sectors = 8;
1898 } 1876 }
1899 1877
1900revalidate:
1901 /* If the dir block has changed since the last call to 1878 /* If the dir block has changed since the last call to
1902 * readdir(2), then we might be pointing to an invalid 1879 * readdir(2), then we might be pointing to an invalid
1903 * dirent right now. Scan from the start of the block 1880 * dirent right now. Scan from the start of the block
@@ -1917,93 +1894,64 @@ revalidate:
1917 i += le16_to_cpu(de->rec_len); 1894 i += le16_to_cpu(de->rec_len);
1918 } 1895 }
1919 offset = i; 1896 offset = i;
1920 *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) 1897 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
1921 | offset; 1898 | offset;
1922 *f_version = inode->i_version; 1899 *f_version = inode->i_version;
1923 } 1900 }
1924 1901
1925 while (!error && *f_pos < i_size_read(inode) 1902 while (ctx->pos < i_size_read(inode)
1926 && offset < sb->s_blocksize) { 1903 && offset < sb->s_blocksize) {
1927 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 1904 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1928 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1905 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
1929 /* On error, skip the f_pos to the 1906 /* On error, skip the f_pos to the
1930 next block. */ 1907 next block. */
1931 *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; 1908 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
1932 brelse(bh); 1909 brelse(bh);
1933 goto out; 1910 continue;
1934 } 1911 }
1935 offset += le16_to_cpu(de->rec_len);
1936 if (le64_to_cpu(de->inode)) { 1912 if (le64_to_cpu(de->inode)) {
1937 /* We might block in the next section
1938 * if the data destination is
1939 * currently swapped out. So, use a
1940 * version stamp to detect whether or
1941 * not the directory has been modified
1942 * during the copy operation.
1943 */
1944 unsigned long version = *f_version;
1945 unsigned char d_type = DT_UNKNOWN; 1913 unsigned char d_type = DT_UNKNOWN;
1946 1914
1947 if (de->file_type < OCFS2_FT_MAX) 1915 if (de->file_type < OCFS2_FT_MAX)
1948 d_type = ocfs2_filetype_table[de->file_type]; 1916 d_type = ocfs2_filetype_table[de->file_type];
1949 error = filldir(priv, de->name, 1917 if (!dir_emit(ctx, de->name,
1950 de->name_len, 1918 de->name_len,
1951 *f_pos,
1952 le64_to_cpu(de->inode), 1919 le64_to_cpu(de->inode),
1953 d_type); 1920 d_type)) {
1954 if (error) { 1921 brelse(bh);
1955 if (filldir_err) 1922 return 0;
1956 *filldir_err = error;
1957 break;
1958 } 1923 }
1959 if (version != *f_version) 1924 stored++;
1960 goto revalidate;
1961 stored ++;
1962 } 1925 }
1963 *f_pos += le16_to_cpu(de->rec_len); 1926 offset += le16_to_cpu(de->rec_len);
1927 ctx->pos += le16_to_cpu(de->rec_len);
1964 } 1928 }
1965 offset = 0; 1929 offset = 0;
1966 brelse(bh); 1930 brelse(bh);
1967 bh = NULL; 1931 bh = NULL;
1932 if (!persist && stored)
1933 break;
1968 } 1934 }
1969 1935 return 0;
1970 stored = 0;
1971out:
1972 return stored;
1973} 1936}
1974 1937
1975static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, 1938static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
1976 loff_t *f_pos, void *priv, filldir_t filldir, 1939 struct dir_context *ctx,
1977 int *filldir_err) 1940 bool persist)
1978{ 1941{
1979 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1942 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1980 return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, 1943 return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
1981 filldir, filldir_err); 1944 return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
1982
1983 return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
1984 filldir_err);
1985} 1945}
1986 1946
1987/* 1947/*
1988 * This is intended to be called from inside other kernel functions, 1948 * This is intended to be called from inside other kernel functions,
1989 * so we fake some arguments. 1949 * so we fake some arguments.
1990 */ 1950 */
1991int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 1951int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
1992 filldir_t filldir)
1993{ 1952{
1994 int ret = 0, filldir_err = 0;
1995 u64 version = inode->i_version; 1953 u64 version = inode->i_version;
1996 1954 ocfs2_dir_foreach_blk(inode, &version, ctx, true);
1997 while (*f_pos < i_size_read(inode)) {
1998 ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
1999 filldir, &filldir_err);
2000 if (ret || filldir_err)
2001 break;
2002 }
2003
2004 if (ret > 0)
2005 ret = -EIO;
2006
2007 return 0; 1955 return 0;
2008} 1956}
2009 1957
@@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
2011 * ocfs2_readdir() 1959 * ocfs2_readdir()
2012 * 1960 *
2013 */ 1961 */
2014int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 1962int ocfs2_readdir(struct file *file, struct dir_context *ctx)
2015{ 1963{
2016 int error = 0; 1964 int error = 0;
2017 struct inode *inode = file_inode(filp); 1965 struct inode *inode = file_inode(file);
2018 int lock_level = 0; 1966 int lock_level = 0;
2019 1967
2020 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); 1968 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2021 1969
2022 error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); 1970 error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
2023 if (lock_level && error >= 0) { 1971 if (lock_level && error >= 0) {
2024 /* We release EX lock which used to update atime 1972 /* We release EX lock which used to update atime
2025 * and get PR lock again to reduce contention 1973 * and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2035 goto bail_nolock; 1983 goto bail_nolock;
2036 } 1984 }
2037 1985
2038 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 1986 error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
2039 dirent, filldir, NULL);
2040 1987
2041 ocfs2_inode_unlock(inode, lock_level); 1988 ocfs2_inode_unlock(inode, lock_level);
2042 if (error) 1989 if (error)
@@ -2120,6 +2067,7 @@ bail:
2120} 2067}
2121 2068
2122struct ocfs2_empty_dir_priv { 2069struct ocfs2_empty_dir_priv {
2070 struct dir_context ctx;
2123 unsigned seen_dot; 2071 unsigned seen_dot;
2124 unsigned seen_dot_dot; 2072 unsigned seen_dot_dot;
2125 unsigned seen_other; 2073 unsigned seen_other;
@@ -2204,10 +2152,9 @@ out:
2204int ocfs2_empty_dir(struct inode *inode) 2152int ocfs2_empty_dir(struct inode *inode)
2205{ 2153{
2206 int ret; 2154 int ret;
2207 loff_t start = 0; 2155 struct ocfs2_empty_dir_priv priv = {
2208 struct ocfs2_empty_dir_priv priv; 2156 .ctx.actor = ocfs2_empty_dir_filldir,
2209 2157 };
2210 memset(&priv, 0, sizeof(priv));
2211 2158
2212 if (ocfs2_dir_indexed(inode)) { 2159 if (ocfs2_dir_indexed(inode)) {
2213 ret = ocfs2_empty_dir_dx(inode, &priv); 2160 ret = ocfs2_empty_dir_dx(inode, &priv);
@@ -2219,7 +2166,7 @@ int ocfs2_empty_dir(struct inode *inode)
2219 */ 2166 */
2220 } 2167 }
2221 2168
2222 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2169 ret = ocfs2_dir_foreach(inode, &priv.ctx);
2223 if (ret) 2170 if (ret)
2224 mlog_errno(ret); 2171 mlog_errno(ret);
2225 2172
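
The ocfs2_empty_dir() hunk above shows the companion pattern for internal directory scans: private state embeds struct dir_context as its first member, the actor receives that context back as its opaque first argument, and a designated initializer wires up the callback. A hedged sketch, under the assumption that the actor keeps the six-argument filldir_t shape visible in the removed lines:

	struct my_scan_priv {
		struct dir_context ctx;		/* must stay the first member */
		unsigned seen;			/* caller-private state       */
	};

	static int my_scan_actor(void *priv, const char *name, int name_len,
				 loff_t pos, u64 ino, unsigned type)
	{
		/* ctx is first, so the opaque pointer is also ours. */
		struct my_scan_priv *p = priv;

		p->seen++;
		return 0;			/* zero means keep iterating */
	}

	struct my_scan_priv priv = { .ctx.actor = my_scan_actor };
	/* then: ocfs2_dir_foreach(inode, &priv.ctx); */
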
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb645..f0344b75b14d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
92 struct ocfs2_dir_lookup_result *res); 92 struct ocfs2_dir_lookup_result *res);
93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
94 int namelen, u64 *blkno); 94 int namelen, u64 *blkno);
95int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 95int ocfs2_readdir(struct file *file, struct dir_context *ctx);
96int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 96int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
97 filldir_t filldir);
98int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 97int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
99 struct inode *dir, 98 struct inode *dir,
100 struct buffer_head *parent_fe_bh, 99 struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
178 lock->ml.node); 178 lock->ml.node);
179 } 179 }
180 } else { 180 } else {
181 status = DLM_NORMAL;
181 dlm_lock_get(lock); 182 dlm_lock_get(lock);
182 list_add_tail(&lock->list, &res->blocked); 183 list_add_tail(&lock->list, &res->blocked);
183 kick_thread = 1; 184 kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); 55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56 56
57static int dlm_recovery_thread(void *data); 57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm); 58static int dlm_do_recovery(struct dlm_ctxt *dlm);
62 59
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); 60static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
789 u8 dead_node) 786 u8 dead_node)
790{ 787{
791 struct dlm_lock_request lr; 788 struct dlm_lock_request lr;
792 enum dlm_status ret; 789 int ret;
793 790
794 mlog(0, "\n"); 791 mlog(0, "\n");
795 792
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 lr.dead_node = dead_node; 799 lr.dead_node = dead_node;
803 800
804 // send message 801 // send message
805 ret = DLM_NOLOCKMGR;
806 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, 802 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
807 &lr, sizeof(lr), request_from, NULL); 803 &lr, sizeof(lr), request_from, NULL);
808 804
@@ -2696,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2696 dlm->name, br->node_idx, br->dead_node, 2692 dlm->name, br->node_idx, br->dead_node,
2697 dlm->reco.dead_node, dlm->reco.new_master); 2693 dlm->reco.dead_node, dlm->reco.new_master);
2698 spin_unlock(&dlm->spinlock); 2694 spin_unlock(&dlm->spinlock);
2695 dlm_put(dlm);
2699 return -EAGAIN; 2696 return -EAGAIN;
2700 } 2697 }
2701 spin_unlock(&dlm->spinlock); 2698 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ff54014a24ec..3261d71319ee 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 371 goto out;
372 372
373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
374 374
375out: 375out:
376 return status; 376 return status;
@@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
899 zero_clusters = last_cpos - zero_cpos; 899 zero_clusters = last_cpos - zero_cpos;
900 900
901 if (needs_cow) { 901 if (needs_cow) {
902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 902 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
903 zero_clusters, UINT_MAX); 903 zero_clusters, UINT_MAX);
904 if (rc) { 904 if (rc) {
905 mlog_errno(rc); 905 mlog_errno(rc);
@@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2078 2078
2079 *meta_level = 1; 2079 *meta_level = 1;
2080 2080
2081 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2081 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2082 if (ret) 2082 if (ret)
2083 mlog_errno(ret); 2083 mlog_errno(ret);
2084out: 2084out:
@@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2646 goto out; 2646 goto out;
2647 } 2647 }
2648 2648
2649 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2649 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2650 ret = -EINVAL;
2651 if (!ret && offset > inode->i_sb->s_maxbytes)
2652 ret = -EINVAL;
2653 if (ret)
2654 goto out;
2655
2656 if (offset != file->f_pos) {
2657 file->f_pos = offset;
2658 file->f_version = 0;
2659 }
2660 2650
2661out: 2651out:
2662 mutex_unlock(&inode->i_mutex); 2652 mutex_unlock(&inode->i_mutex);
@@ -2712,7 +2702,7 @@ const struct file_operations ocfs2_fops = {
2712const struct file_operations ocfs2_dops = { 2702const struct file_operations ocfs2_dops = {
2713 .llseek = generic_file_llseek, 2703 .llseek = generic_file_llseek,
2714 .read = generic_read_dir, 2704 .read = generic_read_dir,
2715 .readdir = ocfs2_readdir, 2705 .iterate = ocfs2_readdir,
2716 .fsync = ocfs2_sync_file, 2706 .fsync = ocfs2_sync_file,
2717 .release = ocfs2_dir_release, 2707 .release = ocfs2_dir_release,
2718 .open = ocfs2_dir_open, 2708 .open = ocfs2_dir_open,
@@ -2759,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2759const struct file_operations ocfs2_dops_no_plocks = { 2749const struct file_operations ocfs2_dops_no_plocks = {
2760 .llseek = generic_file_llseek, 2750 .llseek = generic_file_llseek,
2761 .read = generic_read_dir, 2751 .read = generic_read_dir,
2762 .readdir = ocfs2_readdir, 2752 .iterate = ocfs2_readdir,
2763 .fsync = ocfs2_sync_file, 2753 .fsync = ocfs2_sync_file,
2764 .release = ocfs2_dir_release, 2754 .release = ocfs2_dir_release,
2765 .open = ocfs2_dir_open, 2755 .open = ocfs2_dir_open,
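
The llseek hunk above folds ten lines of hand-rolled bounds checking and f_pos/f_version bookkeeping into vfs_setpos(), which validates the offset against a maximum, updates file->f_pos, resets file->f_version on a move, and returns the offset or a negative errno. A hedged sketch of equivalent use inside a filesystem's llseek method:

	/* Sketch only: replaces the removed manual checks. */
	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
	if (offset < 0)
		return offset;	/* e.g. -EINVAL for out-of-range offsets */
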
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfabcd12e..242170d83971 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1941} 1941}
1942 1942
1943struct ocfs2_orphan_filldir_priv { 1943struct ocfs2_orphan_filldir_priv {
1944 struct dir_context ctx;
1944 struct inode *head; 1945 struct inode *head;
1945 struct ocfs2_super *osb; 1946 struct ocfs2_super *osb;
1946}; 1947};
@@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1977{ 1978{
1978 int status; 1979 int status;
1979 struct inode *orphan_dir_inode = NULL; 1980 struct inode *orphan_dir_inode = NULL;
1980 struct ocfs2_orphan_filldir_priv priv; 1981 struct ocfs2_orphan_filldir_priv priv = {
1981 loff_t pos = 0; 1982 .ctx.actor = ocfs2_orphan_filldir,
1982 1983 .osb = osb,
1983 priv.osb = osb; 1984 .head = *head
1984 priv.head = *head; 1985 };
1985 1986
1986 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1987 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1987 ORPHAN_DIR_SYSTEM_INODE, 1988 ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1999 goto out; 2000 goto out;
2000 } 2001 }
2001 2002
2002 status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, 2003 status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
2003 ocfs2_orphan_filldir);
2004 if (status) { 2004 if (status) {
2005 mlog_errno(status); 2005 mlog_errno(status);
2006 goto out_cluster; 2006 goto out_cluster;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..0a992737dcaf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
200 200
201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
202{ 202{
203 atomic_set(&osb->needs_checkpoint, 1);
204 wake_up(&osb->checkpoint_event); 203 wake_up(&osb->checkpoint_event);
205} 204}
206 205
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f1fc172175b6..452068b45749 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle,
69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
71 71
72 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, 72 ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
73 p_cpos, new_p_cpos, len); 73 p_cpos, new_p_cpos, len);
74 if (ret) { 74 if (ret) {
75 mlog_errno(ret); 75 mlog_errno(ret);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
522 522
523 fe->i_last_eb_blk = 0; 523 fe->i_last_eb_blk = 0;
524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
525 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); 525 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
526 fe->i_atime = fe->i_ctime = fe->i_mtime = 526 fe->i_atime = fe->i_ctime = fe->i_mtime =
527 cpu_to_le64(CURRENT_TIME.tv_sec); 527 cpu_to_le64(CURRENT_TIME.tv_sec);
528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
773 return ret; 773 return ret;
774} 774}
775 775
776static inline int inode_is_unlinkable(struct inode *inode) 776static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
777{ 777{
778 if (S_ISDIR(inode->i_mode)) { 778 if (S_ISDIR(inode->i_mode)) {
779 if (inode->i_nlink == 2) 779 if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
791{ 791{
792 int status; 792 int status;
793 int child_locked = 0; 793 int child_locked = 0;
794 bool is_unlinkable = false;
794 struct inode *inode = dentry->d_inode; 795 struct inode *inode = dentry->d_inode;
795 struct inode *orphan_dir = NULL; 796 struct inode *orphan_dir = NULL;
796 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 797 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
865 goto leave; 866 goto leave;
866 } 867 }
867 868
868 if (inode_is_unlinkable(inode)) { 869 if (ocfs2_inode_is_unlinkable(inode)) {
869 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 870 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
870 OCFS2_I(inode)->ip_blkno, 871 OCFS2_I(inode)->ip_blkno,
871 orphan_name, &orphan_insert); 872 orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
873 mlog_errno(status); 874 mlog_errno(status);
874 goto leave; 875 goto leave;
875 } 876 }
877 is_unlinkable = true;
876 } 878 }
877 879
878 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); 880 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
892 894
893 fe = (struct ocfs2_dinode *) fe_bh->b_data; 895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
894 896
895 if (inode_is_unlinkable(inode)) {
896 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
897 &orphan_insert, orphan_dir);
898 if (status < 0) {
899 mlog_errno(status);
900 goto leave;
901 }
902 }
903
904 /* delete the name from the parent dir */ 897 /* delete the name from the parent dir */
905 status = ocfs2_delete_entry(handle, dir, &lookup); 898 status = ocfs2_delete_entry(handle, dir, &lookup);
906 if (status < 0) { 899 if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
923 mlog_errno(status); 916 mlog_errno(status);
924 if (S_ISDIR(inode->i_mode)) 917 if (S_ISDIR(inode->i_mode))
925 inc_nlink(dir); 918 inc_nlink(dir);
919 goto leave;
920 }
921
922 if (is_unlinkable) {
923 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
924 orphan_name, &orphan_insert, orphan_dir);
925 if (status < 0)
926 mlog_errno(status);
926 } 927 }
927 928
928leave: 929leave:
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2012 goto leave; 2013 goto leave;
2013 } 2014 }
2014 2015
2016 /*
2017 * We're going to journal the change of i_flags and i_orphaned_slot.
2018 * It's safe anyway, though some callers may duplicate the journaling.
2019 * Journaling within the func just make the logic look more
2020 * straightforward.
2021 */
2022 status = ocfs2_journal_access_di(handle,
2023 INODE_CACHE(inode),
2024 fe_bh,
2025 OCFS2_JOURNAL_ACCESS_WRITE);
2026 if (status < 0) {
2027 mlog_errno(status);
2028 goto leave;
2029 }
2030
2015 /* we're a cluster, and nlink can change on disk from 2031 /* we're a cluster, and nlink can change on disk from
2016 * underneath us... */ 2032 * underneath us... */
2017 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2033 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2026 orphan_dir_bh, lookup); 2042 orphan_dir_bh, lookup);
2027 if (status < 0) { 2043 if (status < 0) {
2028 mlog_errno(status); 2044 mlog_errno(status);
2029 goto leave; 2045 goto rollback;
2030 }
2031
2032 /*
2033 * We're going to journal the change of i_flags and i_orphaned_slot.
2034 * It's safe anyway, though some callers may duplicate the journaling.
2035 * Journaling within the func just make the logic look more
2036 * straightforward.
2037 */
2038 status = ocfs2_journal_access_di(handle,
2039 INODE_CACHE(inode),
2040 fe_bh,
2041 OCFS2_JOURNAL_ACCESS_WRITE);
2042 if (status < 0) {
2043 mlog_errno(status);
2044 goto leave;
2045 } 2046 }
2046 2047
2047 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 2048 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2048 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2049 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2049 2050
2050 /* Record which orphan dir our inode now resides 2051 /* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2057 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2058 osb->slot_num); 2059 osb->slot_num);
2059 2060
2061rollback:
2062 if (status < 0) {
2063 if (S_ISDIR(inode->i_mode))
2064 ocfs2_add_links_count(orphan_fe, -1);
2065 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2066 }
2067
2060leave: 2068leave:
2061 brelse(orphan_dir_bh); 2069 brelse(orphan_dir_bh);
2062 2070
2063 if (status)
2064 mlog_errno(status);
2065 return status; 2071 return status;
2066} 2072}
2067 2073
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2434 } 2440 }
2435 2441
2436 di = (struct ocfs2_dinode *)di_bh->b_data; 2442 di = (struct ocfs2_dinode *)di_bh->b_data;
2437 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2443 di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
2438 di->i_orphaned_slot = 0; 2444 di->i_orphaned_slot = 0;
2439 set_nlink(inode, 1); 2445 set_nlink(inode, 1);
2440 ocfs2_set_links_count(di, inode->i_nlink); 2446 ocfs2_set_links_count(di, inode->i_nlink);
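
Alongside the orphan-dir reordering, the hunks above also swap le32_add_cpu() on i_flags for bitwise OR/AND-NOT. Arithmetic add only sets a flag correctly when the bit is known to be clear; the bitwise forms are idempotent. A small illustration (sketch, not from the patch):

	__le32 flags = cpu_to_le32(OCFS2_VALID_FL);

	/* Arithmetic add carries into the next bit if the flag is set: */
	le32_add_cpu(&flags, OCFS2_VALID_FL);		/* flag lost, wrong bit set */
	/* Bitwise ops are safe regardless of the prior state: */
	flags |= cpu_to_le32(OCFS2_VALID_FL);		/* set, idempotent   */
	flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);	/* clear, idempotent */
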
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
347 struct task_struct *recovery_thread_task; 347 struct task_struct *recovery_thread_task;
348 int disable_recovery; 348 int disable_recovery;
349 wait_queue_head_t checkpoint_event; 349 wait_queue_head_t checkpoint_event;
350 atomic_t needs_checkpoint;
351 struct ocfs2_journal *journal; 350 struct ocfs2_journal *journal;
352 unsigned long osb_commit_interval; 351 unsigned long osb_commit_interval;
353 352
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..a70d604593b6 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,7 +49,6 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
53 u32 cow_start; 52 u32 cow_start;
54 u32 cow_len; 53 u32 cow_len;
55 struct ocfs2_extent_tree data_et; 54 struct ocfs2_extent_tree data_et;
@@ -66,7 +65,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 65 u32 *num_clusters,
67 unsigned int *extent_flags); 66 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 67 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct file *file, 68 struct inode *inode,
70 u32 cpos, u32 old_cluster, 69 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 70 u32 new_cluster, u32 new_len);
72}; 71};
@@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2922} 2921}
2923 2922
2924int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2923int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct file *file, 2924 struct inode *inode,
2926 u32 cpos, u32 old_cluster, 2925 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2926 u32 new_cluster, u32 new_len)
2928{ 2927{
2929 int ret = 0, partial; 2928 int ret = 0, partial;
2930 struct inode *inode = file_inode(file); 2929 struct super_block *sb = inode->i_sb;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2930 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2934 struct page *page; 2931 struct page *page;
2935 pgoff_t page_index; 2932 pgoff_t page_index;
@@ -2965,6 +2962,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2965 to = map_end & (PAGE_CACHE_SIZE - 1); 2962 to = map_end & (PAGE_CACHE_SIZE - 1);
2966 2963
2967 page = find_or_create_page(mapping, page_index, GFP_NOFS); 2964 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2965 if (!page) {
2966 ret = -ENOMEM;
2967 mlog_errno(ret);
2968 break;
2969 }
2968 2970
2969 /* 2971 /*
2970 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page 2972 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
@@ -2973,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2975 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2974 BUG_ON(PageDirty(page)); 2976 BUG_ON(PageDirty(page));
2975 2977
2976 if (PageReadahead(page)) {
2977 page_cache_async_readahead(mapping,
2978 &file->f_ra, file,
2979 page, page_index,
2980 readahead_pages);
2981 }
2982
2983 if (!PageUptodate(page)) { 2978 if (!PageUptodate(page)) {
2984 ret = block_read_full_page(page, ocfs2_get_block); 2979 ret = block_read_full_page(page, ocfs2_get_block);
2985 if (ret) { 2980 if (ret) {
@@ -2999,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2994 }
3000 } 2995 }
3001 2996
3002 ocfs2_map_and_dirty_page(inode, handle, from, to, 2997 ocfs2_map_and_dirty_page(inode,
2998 handle, from, to,
3003 page, 0, &new_block); 2999 page, 0, &new_block);
3004 mark_page_accessed(page); 3000 mark_page_accessed(page);
3005unlock: 3001unlock:
@@ -3015,12 +3011,11 @@ unlock:
3015} 3011}
3016 3012
3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3013int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3018 struct file *file, 3014 struct inode *inode,
3019 u32 cpos, u32 old_cluster, 3015 u32 cpos, u32 old_cluster,
3020 u32 new_cluster, u32 new_len) 3016 u32 new_cluster, u32 new_len)
3021{ 3017{
3022 int ret = 0; 3018 int ret = 0;
3023 struct inode *inode = file_inode(file);
3024 struct super_block *sb = inode->i_sb; 3019 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode); 3020 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3021 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -3145,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3140
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3141 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3142 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context->file, 3143 ret = context->cow_duplicate_clusters(handle, context->inode,
3149 cpos, old, new, len); 3144 cpos, old, new, len);
3150 if (ret) { 3145 if (ret) {
3151 mlog_errno(ret); 3146 mlog_errno(ret);
@@ -3423,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3423 return ret; 3418 return ret;
3424} 3419}
3425 3420
3426static void ocfs2_readahead_for_cow(struct inode *inode,
3427 struct file *file,
3428 u32 start, u32 len)
3429{
3430 struct address_space *mapping;
3431 pgoff_t index;
3432 unsigned long num_pages;
3433 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3434
3435 if (!file)
3436 return;
3437
3438 mapping = file->f_mapping;
3439 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3440 if (!num_pages)
3441 num_pages = 1;
3442
3443 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3444 page_cache_sync_readahead(mapping, &file->f_ra, file,
3445 index, num_pages);
3446}
3447
3448/* 3421/*
3449 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3422 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3450 * past max_cpos. This will stop when it runs into a hole or an 3423 * past max_cpos. This will stop when it runs into a hole or an
3451 * unrefcounted extent. 3424 * unrefcounted extent.
3452 */ 3425 */
3453static int ocfs2_refcount_cow_hunk(struct inode *inode, 3426static int ocfs2_refcount_cow_hunk(struct inode *inode,
3454 struct file *file,
3455 struct buffer_head *di_bh, 3427 struct buffer_head *di_bh,
3456 u32 cpos, u32 write_len, u32 max_cpos) 3428 u32 cpos, u32 write_len, u32 max_cpos)
3457{ 3429{
@@ -3480,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3480 3452
3481 BUG_ON(cow_len == 0); 3453 BUG_ON(cow_len == 0);
3482 3454
3483 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3484
3485 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3455 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3486 if (!context) { 3456 if (!context) {
3487 ret = -ENOMEM; 3457 ret = -ENOMEM;
@@ -3503,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3503 context->ref_root_bh = ref_root_bh; 3473 context->ref_root_bh = ref_root_bh;
3504 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3474 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3505 context->get_clusters = ocfs2_di_get_clusters; 3475 context->get_clusters = ocfs2_di_get_clusters;
3506 context->file = file;
3507 3476
3508 ocfs2_init_dinode_extent_tree(&context->data_et, 3477 ocfs2_init_dinode_extent_tree(&context->data_et,
3509 INODE_CACHE(inode), di_bh); 3478 INODE_CACHE(inode), di_bh);
@@ -3532,7 +3501,6 @@ out:
3532 * clusters between cpos and cpos+write_len are safe to modify. 3501 * clusters between cpos and cpos+write_len are safe to modify.
3533 */ 3502 */
3534int ocfs2_refcount_cow(struct inode *inode, 3503int ocfs2_refcount_cow(struct inode *inode,
3535 struct file *file,
3536 struct buffer_head *di_bh, 3504 struct buffer_head *di_bh,
3537 u32 cpos, u32 write_len, u32 max_cpos) 3505 u32 cpos, u32 write_len, u32 max_cpos)
3538{ 3506{
@@ -3552,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3552 num_clusters = write_len; 3520 num_clusters = write_len;
3553 3521
3554 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3522 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3555 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, 3523 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3556 num_clusters, max_cpos); 3524 num_clusters, max_cpos);
3557 if (ret) { 3525 if (ret) {
3558 mlog_errno(ret); 3526 mlog_errno(ret);
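
The refcounttree.c hunks above drop the struct file argument that was threaded through the CoW path only to feed readahead state (file->f_ra); with ocfs2_readahead_for_cow() gone, every function in the chain takes just the inode. A sketch of a call site after the change; the wrapper below is invented for illustration, only ocfs2_refcount_cow()'s narrowed signature comes from the diff:

/* Hypothetical caller: CoW a range before writing. No struct file is
 * needed any more; the inode and its dinode buffer_head suffice. */
static int cow_range_before_write(struct inode *inode,
				  struct buffer_head *di_bh,
				  u32 cpos, u32 clusters, u32 max_cpos)
{
	return ocfs2_refcount_cow(inode, di_bh, cpos, clusters, max_cpos);
}
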
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7754608c83a4..6422bbcdb525 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  int *credits,
 					  int *ref_blocks);
 int ocfs2_refcount_cow(struct inode *inode,
-		       struct file *filep, struct buffer_head *di_bh,
+		       struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
 
 typedef int (ocfs2_post_refcount_func)(struct inode *inode,
@@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 			     u32 cpos, u32 write_len,
 			     struct ocfs2_post_refcount *post);
 int ocfs2_duplicate_clusters_by_page(handle_t *handle,
-				     struct file *file,
+				     struct inode *inode,
 				     u32 cpos, u32 old_cluster,
 				     u32 new_cluster, u32 new_len);
 int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
-				    struct file *file,
+				    struct inode *inode,
 				    u32 cpos, u32 old_cluster,
 				    u32 new_cluster, u32 new_len);
 int ocfs2_cow_sync_writeback(struct super_block *sb,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	int status;
 	/* there is a really tiny chance the journal calls could fail,
 	 * but we wouldn't want inconsistent blocks in *any* case. */
-	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	u64 bg_ptr, prev_bg_ptr;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		 (unsigned long long)le64_to_cpu(bg->bg_blkno),
 		 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
-	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 prev_bg_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out;
 
 	prev_bg->bg_next_group = bg->bg_next_group;
 	ocfs2_journal_dirty(handle, prev_bg_bh);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_prev_bg;
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
 	ocfs2_journal_dirty(handle, bg_bh);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_bg;
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
 	ocfs2_journal_dirty(handle, fe_bh);
 
-out_rollback:
-	if (status < 0) {
-		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
-		bg->bg_next_group = cpu_to_le64(bg_ptr);
-		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
-	}
-
-	if (status)
+out:
+	if (status < 0)
 		mlog_errno(status);
 	return status;
+
+out_rollback_bg:
+	bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	goto out;
 }
 
 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
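
The relink hunk above replaces one catch-all out_rollback label, which restored all three saved pointers no matter how far the function had gotten, with targeted labels so each failure point undoes only the updates already made (the fe_ptr snapshot becomes unnecessary as a result). The idiom, reduced to a standalone sketch with invented names; reserve() stands in for the journal-access calls, which in real code can fail:

#include <stdio.h>

struct rec { int val; };

static int reserve(struct rec *r) { (void)r; return 0; }

/* Update a then b; on failure, unwind only what was already changed. */
static int update_pair(struct rec *a, struct rec *b, int new_a, int new_b)
{
	int err, a_old = a->val;

	err = reserve(a);
	if (err)
		goto out;		/* nothing modified yet */
	a->val = new_a;

	err = reserve(b);
	if (err)
		goto rollback_a;	/* undo only the first update */
	b->val = new_b;
	return 0;

rollback_a:
	a->val = a_old;
out:
	return err;
}

int main(void)
{
	struct rec a = { 1 }, b = { 2 };
	printf("update_pair: %d\n", update_pair(&a, &b, 10, 20));
	return 0;
}
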
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out,
-			"%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+			"%10s => Pid: %d  Interval: %lu\n", "Commit",
 			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
-			osb->osb_commit_interval,
-			atomic_read(&osb->needs_checkpoint));
+			osb->osb_commit_interval);
 
 	out += snprintf(buf + out, len - out,
 			"%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	init_waitqueue_head(&osb->checkpoint_event);
-	atomic_set(&osb->needs_checkpoint, 0);
 
 	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 
 	down_write(&oi->ip_alloc_sem);
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
-		if (!ocfs2_xattr_has_space_inline(inode, di)) {
-			ret = -ENOSPC;
-			goto out;
-		}
-	}
-
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
 		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
 		if (ret) {
 			if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
 	}
 
 	new_oi = OCFS2_I(args->new_inode);
+	/*
+	 * Adjust extent record count to reserve space for extended attribute.
+	 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+	 */
+	if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+	    !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+		struct ocfs2_extent_list *el = &new_di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(inline_size /
+					sizeof(struct ocfs2_extent_rec)));
+	}
 	spin_lock(&new_oi->ip_lock);
 	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
 	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
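
The xattr.c addition reserves in-inode space for the reflinked xattrs by shrinking the destination inode's extent list: l_count drops by however many ocfs2_extent_rec slots the inline xattr area displaces. The arithmetic as a tiny userspace illustration, with assumed sizes (the 16-byte record size matches the on-disk ocfs2_extent_rec as far as I know; the other numbers are made up):

#include <stdio.h>

int main(void)
{
	unsigned inline_size = 256;	/* hypothetical inline xattr area */
	unsigned rec_size = 16;		/* assumed sizeof(struct ocfs2_extent_rec) */
	unsigned l_count = 243;		/* hypothetical starting slot count */

	l_count -= inline_size / rec_size;	/* what le16_add_cpu() does above */
	printf("extent record slots left: %u\n", l_count);	/* prints 227 */
	return 0;
}
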
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebcad3a8..1b8e9e8405b2 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
 	return is_bad;
 }
 
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 		u64 fsblock, int hindex)
 {
-	struct inode *dir = file_inode(filp);
-	struct buffer_head *bh;
-	struct omfs_inode *oi;
-	u64 self;
-	int res = 0;
-	unsigned char d_type;
-
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = omfs_bread(dir->i_sb, fsblock);
+		struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+		struct omfs_inode *oi;
+		u64 self;
+		unsigned char d_type;
+
 		if (!bh)
-			goto out;
+			return true;
 
 		oi = (struct omfs_inode *) bh->b_data;
 		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
 			brelse(bh);
-			goto out;
+			return true;
 		}
 
 		self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
 
-		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
-				OMFS_NAMELEN), filp->f_pos, self, d_type);
+		if (!dir_emit(ctx, oi->i_name,
+			      strnlen(oi->i_name, OMFS_NAMELEN),
+			      self, d_type)) {
+			brelse(bh);
+			return false;
+		}
 		brelse(bh);
-		if (res < 0)
-			break;
-		filp->f_pos++;
+		ctx->pos++;
 	}
-out:
-	return res;
+	return true;
 }
 
 static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
 	return err;
 }
 
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *dir = file_inode(filp);
+	struct inode *dir = file_inode(file);
 	struct buffer_head *bh;
-	loff_t offset, res;
+	__be64 *p;
 	unsigned int hchain, hindex;
 	int nbuckets;
-	u64 fsblock;
-	int ret = -EINVAL;
-
-	if (filp->f_pos >> 32)
-		goto success;
-
-	switch ((unsigned long) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto success;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, 1,
-				parent_ino(filp->f_dentry), DT_DIR) < 0)
-			goto success;
-		filp->f_pos = 1 << 20;
-		/* fall through */
+
+	if (ctx->pos >> 32)
+		return -EINVAL;
+
+	if (ctx->pos < 1 << 20) {
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+		ctx->pos = 1 << 20;
 	}
 
 	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
 
 	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
-	hchain = (filp->f_pos >> 20) - 1;
-	hindex = filp->f_pos & 0xfffff;
+	hchain = (ctx->pos >> 20) - 1;
+	hindex = ctx->pos & 0xfffff;
 
 	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
-		goto out;
+		return -EINVAL;
 
-	offset = OMFS_DIR_START + hchain * 8;
+	p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
 
-	for (; hchain < nbuckets; hchain++, offset += 8) {
-		fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
-		res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
-		hindex = 0;
-		if (res < 0)
+	for (; hchain < nbuckets; hchain++) {
+		__u64 fsblock = be64_to_cpu(*p++);
+		if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
 			break;
-
-		filp->f_pos = (hchain+2) << 20;
+		hindex = 0;
+		ctx->pos = (hchain+2) << 20;
 	}
 	brelse(bh);
-success:
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
 const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
 
 const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = omfs_readdir,
+	.iterate = omfs_readdir,
 	.llseek = generic_file_llseek,
};
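
The omfs conversion is the template repeated by every filesystem in this merge: ->readdir/filldir becomes ->iterate with a dir_context, the "." and ".." special cases collapse into dir_emit_dots(), and f_pos bookkeeping moves to ctx->pos. A minimal sketch of the pattern, assuming kernel headers; "myfs" and its static name table are invented:

#include <linux/fs.h>
#include <linux/string.h>

/* Sketch only: the smallest useful ->iterate implementation. */
static int myfs_readdir(struct file *file, struct dir_context *ctx)
{
	static const char *names[] = { "alpha", "beta", "gamma" };

	/* emits "." and ".." as needed and leaves ctx->pos at 2 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	while (ctx->pos - 2 < (loff_t)ARRAY_SIZE(names)) {
		const char *name = names[ctx->pos - 2];

		/* false means the user buffer is full: stop now;
		 * ctx->pos records where the next call resumes */
		if (!dir_emit(ctx, name, strlen(name),
			      100 + ctx->pos, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}
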
diff --git a/fs/open.c b/fs/open.c
index 8c741002f947..7931f76acc2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,7 +823,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	int lookup_flags = 0;
 	int acc_mode;
 
-	if (flags & O_CREAT)
+	if (flags & (O_CREAT | __O_TMPFILE))
 		op->mode = (mode & S_IALLUGO) | S_IFREG;
 	else
 		op->mode = 0;
@@ -840,11 +840,17 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	/*
-	 * If we have O_PATH in the open flag. Then we
-	 * cannot have anything other than the below set of flags
-	 */
-	if (flags & O_PATH) {
+	if (flags & __O_TMPFILE) {
+		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+			return -EINVAL;
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+		if (!(acc_mode & MAY_WRITE))
+			return -EINVAL;
+	} else if (flags & O_PATH) {
+		/*
+		 * If we have O_PATH in the open flag. Then we
+		 * cannot have anything other than the below set of flags
+		 */
 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 		acc_mode = 0;
 	} else {
@@ -876,7 +882,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		lookup_flags |= LOOKUP_DIRECTORY;
 	if (!(flags & O_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
-	return lookup_flags;
+	op->lookup_flags = lookup_flags;
+	return 0;
 }
 
 /**
@@ -893,8 +900,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	return do_filp_open(AT_FDCWD, name, &op, lookup);
+	int err = build_open_flags(flags, mode, &op);
+	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
 }
 
 /**
@@ -919,37 +926,43 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		       const char *filename, int flags)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, 0, &op);
+	int err = build_open_flags(flags, 0, &op);
+	if (err)
+		return ERR_PTR(err);
 	if (flags & O_CREAT)
 		return ERR_PTR(-EINVAL);
 	if (!filename && (flags & O_DIRECTORY))
 		if (!dentry->d_inode->i_op->lookup)
 			return ERR_PTR(-ENOTDIR);
-	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+	return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
 
 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	struct filename *tmp = getname(filename);
-	int fd = PTR_ERR(tmp);
-
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd_flags(flags);
-		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
-			if (IS_ERR(f)) {
-				put_unused_fd(fd);
-				fd = PTR_ERR(f);
-			} else {
-				fsnotify_open(f);
-				fd_install(fd, f);
-			}
+	int fd = build_open_flags(flags, mode, &op);
+	struct filename *tmp;
+
+	if (fd)
+		return fd;
+
+	tmp = getname(filename);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	fd = get_unused_fd_flags(flags);
+	if (fd >= 0) {
+		struct file *f = do_filp_open(dfd, tmp, &op);
+		if (IS_ERR(f)) {
+			put_unused_fd(fd);
+			fd = PTR_ERR(f);
+		} else {
+			fsnotify_open(f);
+			fd_install(fd, f);
 		}
-		putname(tmp);
 	}
+	putname(tmp);
 	return fd;
 }
 
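
build_open_flags() now screens O_TMPFILE up front: the O_TMPFILE_MASK check rejects malformed flag combinations, and a tmpfile must be opened with write access, otherwise -EINVAL comes back before any path walk. From userspace the flag is used roughly as below (a hedged example; glibc exposes O_TMPFILE under _GNU_SOURCE, and error handling is trimmed to the essentials):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* the path names a directory; the file itself is never linked into
	 * it. A write mode is mandatory, so O_TMPFILE | O_RDONLY would fail
	 * with EINVAL per the hunk above. */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "scratch\n", 8) != 8)
		perror("write");
	close(fd);	/* the anonymous file disappears here */
	return 0;
}
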
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ffde44e..8c0ceb8dd1f7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
 	.release	= seq_release,
 };
 
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
 
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
-	.readdir	= openpromfs_readdir,
+	.iterate	= openpromfs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -260,71 +260,64 @@ found:
 	return NULL;
 }
 
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct op_inode_info *oi = OP_I(inode);
 	struct device_node *dp = oi->u.node;
 	struct device_node *child;
 	struct property *prop;
-	unsigned int ino;
 	int i;
 
 	mutex_lock(&op_mutex);
 
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	case 1:
-		if (filldir(dirent, "..", 2, i,
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2,
 			    (dp->parent == NULL ?
 			     OPENPROM_ROOT_INO :
-			     dp->parent->unique_id), DT_DIR) < 0)
+			     dp->parent->unique_id), DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	default:
-		i -= 2;
-
-		/* First, the children nodes as directories. */
-		child = dp->child;
-		while (i && child) {
-			child = child->sibling;
-			i--;
-		}
-		while (child) {
-			if (filldir(dirent,
-				    child->path_component_name,
-				    strlen(child->path_component_name),
-				    filp->f_pos, child->unique_id, DT_DIR) < 0)
-				goto out;
-
-			filp->f_pos++;
-			child = child->sibling;
-		}
+		ctx->pos = 2;
+	}
+	i = ctx->pos - 2;
 
-		/* Next, the properties as files. */
-		prop = dp->properties;
-		while (i && prop) {
-			prop = prop->next;
-			i--;
-		}
-		while (prop) {
-			if (filldir(dirent, prop->name, strlen(prop->name),
-				    filp->f_pos, prop->unique_id, DT_REG) < 0)
-				goto out;
+	/* First, the children nodes as directories. */
+	child = dp->child;
+	while (i && child) {
+		child = child->sibling;
+		i--;
+	}
+	while (child) {
+		if (!dir_emit(ctx,
+			    child->path_component_name,
+			    strlen(child->path_component_name),
+			    child->unique_id, DT_DIR))
+			goto out;
 
-			filp->f_pos++;
-			prop = prop->next;
-		}
+		ctx->pos++;
+		child = child->sibling;
+	}
+
+	/* Next, the properties as files. */
+	prop = dp->properties;
+	while (i && prop) {
+		prop = prop->next;
+		i--;
 	}
+	while (prop) {
+		if (!dir_emit(ctx, prop->name, strlen(prop->name),
+			    prop->unique_id, DT_REG))
+			goto out;
+
+		ctx->pos++;
+		prop = prop->next;
+	}
+
 out:
 	mutex_unlock(&op_mutex);
 	return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3834dad09b3..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1681,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
1681 * reported by readdir in sync with the inode numbers reported 1681 * reported by readdir in sync with the inode numbers reported
1682 * by stat. 1682 * by stat.
1683 */ 1683 */
1684int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1684bool proc_fill_cache(struct file *file, struct dir_context *ctx,
1685 const char *name, int len, 1685 const char *name, int len,
1686 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1686 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1687{ 1687{
1688 struct dentry *child, *dir = filp->f_path.dentry; 1688 struct dentry *child, *dir = file->f_path.dentry;
1689 struct qstr qname = QSTR_INIT(name, len);
1689 struct inode *inode; 1690 struct inode *inode;
1690 struct qstr qname; 1691 unsigned type;
1691 ino_t ino = 0; 1692 ino_t ino;
1692 unsigned type = DT_UNKNOWN;
1693
1694 qname.name = name;
1695 qname.len = len;
1696 qname.hash = full_name_hash(name, len);
1697 1693
1698 child = d_lookup(dir, &qname); 1694 child = d_hash_and_lookup(dir, &qname);
1699 if (!child) { 1695 if (!child) {
1700 struct dentry *new; 1696 child = d_alloc(dir, &qname);
1701 new = d_alloc(dir, &qname); 1697 if (!child)
1702 if (new) { 1698 goto end_instantiate;
1703 child = instantiate(dir->d_inode, new, task, ptr); 1699 if (instantiate(dir->d_inode, child, task, ptr) < 0) {
1704 if (child) 1700 dput(child);
1705 dput(new); 1701 goto end_instantiate;
1706 else
1707 child = new;
1708 } 1702 }
1709 } 1703 }
1710 if (!child || IS_ERR(child) || !child->d_inode)
1711 goto end_instantiate;
1712 inode = child->d_inode; 1704 inode = child->d_inode;
1713 if (inode) { 1705 ino = inode->i_ino;
1714 ino = inode->i_ino; 1706 type = inode->i_mode >> 12;
1715 type = inode->i_mode >> 12;
1716 }
1717 dput(child); 1707 dput(child);
1708 return dir_emit(ctx, name, len, ino, type);
1709
1718end_instantiate: 1710end_instantiate:
1719 if (!ino) 1711 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1720 ino = find_inode_number(dir, &qname);
1721 if (!ino)
1722 ino = 1;
1723 return filldir(dirent, name, len, filp->f_pos, ino, type);
1724} 1712}
1725 1713
1726#ifdef CONFIG_CHECKPOINT_RESTORE 1714#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1846,7 +1834,7 @@ struct map_files_info {
1846 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1834 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1847}; 1835};
1848 1836
1849static struct dentry * 1837static int
1850proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1838proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1851 struct task_struct *task, const void *ptr) 1839 struct task_struct *task, const void *ptr)
1852{ 1840{
@@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1856 1844
1857 inode = proc_pid_make_inode(dir->i_sb, task); 1845 inode = proc_pid_make_inode(dir->i_sb, task);
1858 if (!inode) 1846 if (!inode)
1859 return ERR_PTR(-ENOENT); 1847 return -ENOENT;
1860 1848
1861 ei = PROC_I(inode); 1849 ei = PROC_I(inode);
1862 ei->op.proc_get_link = proc_map_files_get_link; 1850 ei->op.proc_get_link = proc_map_files_get_link;
@@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1873 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1861 d_set_d_op(dentry, &tid_map_files_dentry_operations);
1874 d_add(dentry, inode); 1862 d_add(dentry, inode);
1875 1863
1876 return NULL; 1864 return 0;
1877} 1865}
1878 1866
1879static struct dentry *proc_map_files_lookup(struct inode *dir, 1867static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1882 unsigned long vm_start, vm_end; 1870 unsigned long vm_start, vm_end;
1883 struct vm_area_struct *vma; 1871 struct vm_area_struct *vma;
1884 struct task_struct *task; 1872 struct task_struct *task;
1885 struct dentry *result; 1873 int result;
1886 struct mm_struct *mm; 1874 struct mm_struct *mm;
1887 1875
1888 result = ERR_PTR(-EPERM); 1876 result = -EPERM;
1889 if (!capable(CAP_SYS_ADMIN)) 1877 if (!capable(CAP_SYS_ADMIN))
1890 goto out; 1878 goto out;
1891 1879
1892 result = ERR_PTR(-ENOENT); 1880 result = -ENOENT;
1893 task = get_proc_task(dir); 1881 task = get_proc_task(dir);
1894 if (!task) 1882 if (!task)
1895 goto out; 1883 goto out;
1896 1884
1897 result = ERR_PTR(-EACCES); 1885 result = -EACCES;
1898 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1886 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1899 goto out_put_task; 1887 goto out_put_task;
1900 1888
1901 result = ERR_PTR(-ENOENT); 1889 result = -ENOENT;
1902 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 1890 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
1903 goto out_put_task; 1891 goto out_put_task;
1904 1892
@@ -1921,7 +1909,7 @@ out_no_vma:
1921out_put_task: 1909out_put_task:
1922 put_task_struct(task); 1910 put_task_struct(task);
1923out: 1911out:
1924 return result; 1912 return ERR_PTR(result);
1925} 1913}
1926 1914
1927static const struct inode_operations proc_map_files_inode_operations = { 1915static const struct inode_operations proc_map_files_inode_operations = {
@@ -1931,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
1931}; 1919};
1932 1920
1933static int 1921static int
1934proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) 1922proc_map_files_readdir(struct file *file, struct dir_context *ctx)
1935{ 1923{
1936 struct dentry *dentry = filp->f_path.dentry;
1937 struct inode *inode = dentry->d_inode;
1938 struct vm_area_struct *vma; 1924 struct vm_area_struct *vma;
1939 struct task_struct *task; 1925 struct task_struct *task;
1940 struct mm_struct *mm; 1926 struct mm_struct *mm;
1941 ino_t ino; 1927 unsigned long nr_files, pos, i;
1928 struct flex_array *fa = NULL;
1929 struct map_files_info info;
1930 struct map_files_info *p;
1942 int ret; 1931 int ret;
1943 1932
1944 ret = -EPERM; 1933 ret = -EPERM;
@@ -1946,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1946 goto out; 1935 goto out;
1947 1936
1948 ret = -ENOENT; 1937 ret = -ENOENT;
1949 task = get_proc_task(inode); 1938 task = get_proc_task(file_inode(file));
1950 if (!task) 1939 if (!task)
1951 goto out; 1940 goto out;
1952 1941
@@ -1955,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1955 goto out_put_task; 1944 goto out_put_task;
1956 1945
1957 ret = 0; 1946 ret = 0;
1958 switch (filp->f_pos) { 1947 if (!dir_emit_dots(file, ctx))
1959 case 0: 1948 goto out_put_task;
1960 ino = inode->i_ino;
1961 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
1962 goto out_put_task;
1963 filp->f_pos++;
1964 case 1:
1965 ino = parent_ino(dentry);
1966 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1967 goto out_put_task;
1968 filp->f_pos++;
1969 default:
1970 {
1971 unsigned long nr_files, pos, i;
1972 struct flex_array *fa = NULL;
1973 struct map_files_info info;
1974 struct map_files_info *p;
1975
1976 mm = get_task_mm(task);
1977 if (!mm)
1978 goto out_put_task;
1979 down_read(&mm->mmap_sem);
1980 1949
1981 nr_files = 0; 1950 mm = get_task_mm(task);
1951 if (!mm)
1952 goto out_put_task;
1953 down_read(&mm->mmap_sem);
1982 1954
1983 /* 1955 nr_files = 0;
1984 * We need two passes here:
1985 *
1986 * 1) Collect vmas of mapped files with mmap_sem taken
1987 * 2) Release mmap_sem and instantiate entries
1988 *
1989 * otherwise we get lockdep complained, since filldir()
1990 * routine might require mmap_sem taken in might_fault().
1991 */
1992 1956
1993 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 1957 /*
1994 if (vma->vm_file && ++pos > filp->f_pos) 1958 * We need two passes here:
1995 nr_files++; 1959 *
1996 } 1960 * 1) Collect vmas of mapped files with mmap_sem taken
1961 * 2) Release mmap_sem and instantiate entries
1962 *
1963 * otherwise we get lockdep complained, since filldir()
1964 * routine might require mmap_sem taken in might_fault().
1965 */
1997 1966
1998 if (nr_files) { 1967 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
1999 fa = flex_array_alloc(sizeof(info), nr_files, 1968 if (vma->vm_file && ++pos > ctx->pos)
2000 GFP_KERNEL); 1969 nr_files++;
2001 if (!fa || flex_array_prealloc(fa, 0, nr_files, 1970 }
2002 GFP_KERNEL)) { 1971
2003 ret = -ENOMEM; 1972 if (nr_files) {
2004 if (fa) 1973 fa = flex_array_alloc(sizeof(info), nr_files,
2005 flex_array_free(fa); 1974 GFP_KERNEL);
2006 up_read(&mm->mmap_sem); 1975 if (!fa || flex_array_prealloc(fa, 0, nr_files,
2007 mmput(mm); 1976 GFP_KERNEL)) {
2008 goto out_put_task; 1977 ret = -ENOMEM;
2009 } 1978 if (fa)
2010 for (i = 0, vma = mm->mmap, pos = 2; vma; 1979 flex_array_free(fa);
2011 vma = vma->vm_next) { 1980 up_read(&mm->mmap_sem);
2012 if (!vma->vm_file) 1981 mmput(mm);
2013 continue; 1982 goto out_put_task;
2014 if (++pos <= filp->f_pos)
2015 continue;
2016
2017 info.mode = vma->vm_file->f_mode;
2018 info.len = snprintf(info.name,
2019 sizeof(info.name), "%lx-%lx",
2020 vma->vm_start, vma->vm_end);
2021 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
2022 BUG();
2023 }
2024 } 1983 }
2025 up_read(&mm->mmap_sem); 1984 for (i = 0, vma = mm->mmap, pos = 2; vma;
2026 1985 vma = vma->vm_next) {
2027 for (i = 0; i < nr_files; i++) { 1986 if (!vma->vm_file)
2028 p = flex_array_get(fa, i); 1987 continue;
2029 ret = proc_fill_cache(filp, dirent, filldir, 1988 if (++pos <= ctx->pos)
2030 p->name, p->len, 1989 continue;
2031 proc_map_files_instantiate, 1990
2032 task, 1991 info.mode = vma->vm_file->f_mode;
2033 (void *)(unsigned long)p->mode); 1992 info.len = snprintf(info.name,
2034 if (ret) 1993 sizeof(info.name), "%lx-%lx",
2035 break; 1994 vma->vm_start, vma->vm_end);
2036 filp->f_pos++; 1995 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
1996 BUG();
2037 } 1997 }
2038 if (fa)
2039 flex_array_free(fa);
2040 mmput(mm);
2041 } 1998 }
1999 up_read(&mm->mmap_sem);
2000
2001 for (i = 0; i < nr_files; i++) {
2002 p = flex_array_get(fa, i);
2003 if (!proc_fill_cache(file, ctx,
2004 p->name, p->len,
2005 proc_map_files_instantiate,
2006 task,
2007 (void *)(unsigned long)p->mode))
2008 break;
2009 ctx->pos++;
2042 } 2010 }
2011 if (fa)
2012 flex_array_free(fa);
2013 mmput(mm);
2043 2014
2044out_put_task: 2015out_put_task:
2045 put_task_struct(task); 2016 put_task_struct(task);
@@ -2049,7 +2020,7 @@ out:
2049 2020
2050static const struct file_operations proc_map_files_operations = { 2021static const struct file_operations proc_map_files_operations = {
2051 .read = generic_read_dir, 2022 .read = generic_read_dir,
2052 .readdir = proc_map_files_readdir, 2023 .iterate = proc_map_files_readdir,
2053 .llseek = default_llseek, 2024 .llseek = default_llseek,
2054}; 2025};
2055 2026
@@ -2152,13 +2123,12 @@ static const struct file_operations proc_timers_operations = {
2152}; 2123};
2153#endif /* CONFIG_CHECKPOINT_RESTORE */ 2124#endif /* CONFIG_CHECKPOINT_RESTORE */
2154 2125
2155static struct dentry *proc_pident_instantiate(struct inode *dir, 2126static int proc_pident_instantiate(struct inode *dir,
2156 struct dentry *dentry, struct task_struct *task, const void *ptr) 2127 struct dentry *dentry, struct task_struct *task, const void *ptr)
2157{ 2128{
2158 const struct pid_entry *p = ptr; 2129 const struct pid_entry *p = ptr;
2159 struct inode *inode; 2130 struct inode *inode;
2160 struct proc_inode *ei; 2131 struct proc_inode *ei;
2161 struct dentry *error = ERR_PTR(-ENOENT);
2162 2132
2163 inode = proc_pid_make_inode(dir->i_sb, task); 2133 inode = proc_pid_make_inode(dir->i_sb, task);
2164 if (!inode) 2134 if (!inode)
@@ -2177,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2177 d_add(dentry, inode); 2147 d_add(dentry, inode);
2178 /* Close the race of the process dying before we return the dentry */ 2148 /* Close the race of the process dying before we return the dentry */
2179 if (pid_revalidate(dentry, 0)) 2149 if (pid_revalidate(dentry, 0))
2180 error = NULL; 2150 return 0;
2181out: 2151out:
2182 return error; 2152 return -ENOENT;
2183} 2153}
2184 2154
2185static struct dentry *proc_pident_lookup(struct inode *dir, 2155static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2187,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2187 const struct pid_entry *ents, 2157 const struct pid_entry *ents,
2188 unsigned int nents) 2158 unsigned int nents)
2189{ 2159{
2190 struct dentry *error; 2160 int error;
2191 struct task_struct *task = get_proc_task(dir); 2161 struct task_struct *task = get_proc_task(dir);
2192 const struct pid_entry *p, *last; 2162 const struct pid_entry *p, *last;
2193 2163
2194 error = ERR_PTR(-ENOENT); 2164 error = -ENOENT;
2195 2165
2196 if (!task) 2166 if (!task)
2197 goto out_no_task; 2167 goto out_no_task;
@@ -2214,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2214out: 2184out:
2215 put_task_struct(task); 2185 put_task_struct(task);
2216out_no_task: 2186out_no_task:
2217 return error; 2187 return ERR_PTR(error);
2218}
2219
2220static int proc_pident_fill_cache(struct file *filp, void *dirent,
2221 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2222{
2223 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2224 proc_pident_instantiate, task, p);
2225} 2188}
2226 2189
2227static int proc_pident_readdir(struct file *filp, 2190static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2228 void *dirent, filldir_t filldir,
2229 const struct pid_entry *ents, unsigned int nents) 2191 const struct pid_entry *ents, unsigned int nents)
2230{ 2192{
2231 int i; 2193 struct task_struct *task = get_proc_task(file_inode(file));
2232 struct dentry *dentry = filp->f_path.dentry; 2194 const struct pid_entry *p;
2233 struct inode *inode = dentry->d_inode;
2234 struct task_struct *task = get_proc_task(inode);
2235 const struct pid_entry *p, *last;
2236 ino_t ino;
2237 int ret;
2238 2195
2239 ret = -ENOENT;
2240 if (!task) 2196 if (!task)
2241 goto out_no_task; 2197 return -ENOENT;
2242 2198
2243 ret = 0; 2199 if (!dir_emit_dots(file, ctx))
2244 i = filp->f_pos; 2200 goto out;
2245 switch (i) { 2201
2246 case 0: 2202 if (ctx->pos >= nents + 2)
2247 ino = inode->i_ino; 2203 goto out;
2248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2249 goto out;
2250 i++;
2251 filp->f_pos++;
2252 /* fall through */
2253 case 1:
2254 ino = parent_ino(dentry);
2255 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2256 goto out;
2257 i++;
2258 filp->f_pos++;
2259 /* fall through */
2260 default:
2261 i -= 2;
2262 if (i >= nents) {
2263 ret = 1;
2264 goto out;
2265 }
2266 p = ents + i;
2267 last = &ents[nents - 1];
2268 while (p <= last) {
2269 if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2270 goto out;
2271 filp->f_pos++;
2272 p++;
2273 }
2274 }
2275 2204
2276 ret = 1; 2205 for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
2206 if (!proc_fill_cache(file, ctx, p->name, p->len,
2207 proc_pident_instantiate, task, p))
2208 break;
2209 ctx->pos++;
2210 }
2277out: 2211out:
2278 put_task_struct(task); 2212 put_task_struct(task);
2279out_no_task: 2213 return 0;
2280 return ret;
2281} 2214}
2282 2215
2283#ifdef CONFIG_SECURITY 2216#ifdef CONFIG_SECURITY
@@ -2362,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
2362 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2295 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2363}; 2296};
2364 2297
2365static int proc_attr_dir_readdir(struct file * filp, 2298static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2366 void * dirent, filldir_t filldir)
2367{ 2299{
2368 return proc_pident_readdir(filp,dirent,filldir, 2300 return proc_pident_readdir(file, ctx,
2369 attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); 2301 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2370} 2302}
2371 2303
2372static const struct file_operations proc_attr_dir_operations = { 2304static const struct file_operations proc_attr_dir_operations = {
2373 .read = generic_read_dir, 2305 .read = generic_read_dir,
2374 .readdir = proc_attr_dir_readdir, 2306 .iterate = proc_attr_dir_readdir,
2375 .llseek = default_llseek, 2307 .llseek = default_llseek,
2376}; 2308};
2377 2309
@@ -2725,16 +2657,15 @@ static const struct pid_entry tgid_base_stuff[] = {
2725#endif 2657#endif
2726}; 2658};
2727 2659
2728static int proc_tgid_base_readdir(struct file * filp, 2660static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
2729 void * dirent, filldir_t filldir)
2730{ 2661{
2731 return proc_pident_readdir(filp,dirent,filldir, 2662 return proc_pident_readdir(file, ctx,
2732 tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); 2663 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2733} 2664}
2734 2665
2735static const struct file_operations proc_tgid_base_operations = { 2666static const struct file_operations proc_tgid_base_operations = {
2736 .read = generic_read_dir, 2667 .read = generic_read_dir,
2737 .readdir = proc_tgid_base_readdir, 2668 .iterate = proc_tgid_base_readdir,
2738 .llseek = default_llseek, 2669 .llseek = default_llseek,
2739}; 2670};
2740 2671
@@ -2836,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
2836 } 2767 }
2837} 2768}
2838 2769
2839static struct dentry *proc_pid_instantiate(struct inode *dir, 2770static int proc_pid_instantiate(struct inode *dir,
2840 struct dentry * dentry, 2771 struct dentry * dentry,
2841 struct task_struct *task, const void *ptr) 2772 struct task_struct *task, const void *ptr)
2842{ 2773{
2843 struct dentry *error = ERR_PTR(-ENOENT);
2844 struct inode *inode; 2774 struct inode *inode;
2845 2775
2846 inode = proc_pid_make_inode(dir->i_sb, task); 2776 inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2860,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2860 d_add(dentry, inode); 2790 d_add(dentry, inode);
2861 /* Close the race of the process dying before we return the dentry */ 2791 /* Close the race of the process dying before we return the dentry */
2862 if (pid_revalidate(dentry, 0)) 2792 if (pid_revalidate(dentry, 0))
2863 error = NULL; 2793 return 0;
2864out: 2794out:
2865 return error; 2795 return -ENOENT;
2866} 2796}
2867 2797
2868struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2798struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2869{ 2799{
2870 struct dentry *result = NULL; 2800 int result = 0;
2871 struct task_struct *task; 2801 struct task_struct *task;
2872 unsigned tgid; 2802 unsigned tgid;
2873 struct pid_namespace *ns; 2803 struct pid_namespace *ns;
@@ -2888,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
2888 result = proc_pid_instantiate(dir, dentry, task, NULL); 2818 result = proc_pid_instantiate(dir, dentry, task, NULL);
2889 put_task_struct(task); 2819 put_task_struct(task);
2890out: 2820out:
2891 return result; 2821 return ERR_PTR(result);
2892} 2822}
2893 2823
2894/* 2824/*
@@ -2936,58 +2866,42 @@ retry:
2936 2866
2937#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2867#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
2938 2868
2939static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2940 struct tgid_iter iter)
2941{
2942 char name[PROC_NUMBUF];
2943 int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2944 return proc_fill_cache(filp, dirent, filldir, name, len,
2945 proc_pid_instantiate, iter.task, NULL);
2946}
2947
2948static int fake_filldir(void *buf, const char *name, int namelen,
2949 loff_t offset, u64 ino, unsigned d_type)
2950{
2951 return 0;
2952}
2953
2954/* for the /proc/ directory itself, after non-process stuff has been done */ 2869/* for the /proc/ directory itself, after non-process stuff has been done */
2955int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2870int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2956{ 2871{
2957 struct tgid_iter iter; 2872 struct tgid_iter iter;
2958 struct pid_namespace *ns; 2873 struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
2959 filldir_t __filldir; 2874 loff_t pos = ctx->pos;
2960 loff_t pos = filp->f_pos;
2961 2875
2962 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2876 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2963 goto out; 2877 return 0;
2964 2878
2965 if (pos == TGID_OFFSET - 1) { 2879 if (pos == TGID_OFFSET - 1) {
2966 if (proc_fill_cache(filp, dirent, filldir, "self", 4, 2880 struct inode *inode = ns->proc_self->d_inode;
2967 NULL, NULL, NULL) < 0) 2881 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2968 goto out; 2882 return 0;
2969 iter.tgid = 0; 2883 iter.tgid = 0;
2970 } else { 2884 } else {
2971 iter.tgid = pos - TGID_OFFSET; 2885 iter.tgid = pos - TGID_OFFSET;
2972 } 2886 }
2973 iter.task = NULL; 2887 iter.task = NULL;
2974 ns = filp->f_dentry->d_sb->s_fs_info;
2975 for (iter = next_tgid(ns, iter); 2888 for (iter = next_tgid(ns, iter);
2976 iter.task; 2889 iter.task;
2977 iter.tgid += 1, iter = next_tgid(ns, iter)) { 2890 iter.tgid += 1, iter = next_tgid(ns, iter)) {
2978 if (has_pid_permissions(ns, iter.task, 2)) 2891 char name[PROC_NUMBUF];
2979 __filldir = filldir; 2892 int len;
2980 else 2893 if (!has_pid_permissions(ns, iter.task, 2))
2981 __filldir = fake_filldir; 2894 continue;
2982 2895
2983 filp->f_pos = iter.tgid + TGID_OFFSET; 2896 len = snprintf(name, sizeof(name), "%d", iter.tgid);
2984 if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { 2897 ctx->pos = iter.tgid + TGID_OFFSET;
2898 if (!proc_fill_cache(file, ctx, name, len,
2899 proc_pid_instantiate, iter.task, NULL)) {
2985 put_task_struct(iter.task); 2900 put_task_struct(iter.task);
2986 goto out; 2901 return 0;
2987 } 2902 }
2988 } 2903 }
2989 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2904 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
2990out:
2991 return 0; 2905 return 0;
2992} 2906}
2993 2907
@@ -3075,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
3075#endif 2989#endif
3076}; 2990};
3077 2991
3078static int proc_tid_base_readdir(struct file * filp, 2992static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3079 void * dirent, filldir_t filldir)
3080{ 2993{
3081 return proc_pident_readdir(filp,dirent,filldir, 2994 return proc_pident_readdir(file, ctx,
3082 tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); 2995 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3083} 2996}
3084 2997
3085static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2998static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3090,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3090 3003
3091static const struct file_operations proc_tid_base_operations = { 3004static const struct file_operations proc_tid_base_operations = {
3092 .read = generic_read_dir, 3005 .read = generic_read_dir,
3093 .readdir = proc_tid_base_readdir, 3006 .iterate = proc_tid_base_readdir,
3094 .llseek = default_llseek, 3007 .llseek = default_llseek,
3095}; 3008};
3096 3009
@@ -3100,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
3100 .setattr = proc_setattr, 3013 .setattr = proc_setattr,
3101}; 3014};
3102 3015
3103static struct dentry *proc_task_instantiate(struct inode *dir, 3016static int proc_task_instantiate(struct inode *dir,
3104 struct dentry *dentry, struct task_struct *task, const void *ptr) 3017 struct dentry *dentry, struct task_struct *task, const void *ptr)
3105{ 3018{
3106 struct dentry *error = ERR_PTR(-ENOENT);
3107 struct inode *inode; 3019 struct inode *inode;
3108 inode = proc_pid_make_inode(dir->i_sb, task); 3020 inode = proc_pid_make_inode(dir->i_sb, task);
3109 3021
@@ -3122,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3122 d_add(dentry, inode); 3034 d_add(dentry, inode);
3123 /* Close the race of the process dying before we return the dentry */ 3035 /* Close the race of the process dying before we return the dentry */
3124 if (pid_revalidate(dentry, 0)) 3036 if (pid_revalidate(dentry, 0))
3125 error = NULL; 3037 return 0;
3126out: 3038out:
3127 return error; 3039 return -ENOENT;
3128} 3040}
3129 3041
3130static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3042static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3131{ 3043{
3132 struct dentry *result = ERR_PTR(-ENOENT); 3044 int result = -ENOENT;
3133 struct task_struct *task; 3045 struct task_struct *task;
3134 struct task_struct *leader = get_proc_task(dir); 3046 struct task_struct *leader = get_proc_task(dir);
3135 unsigned tid; 3047 unsigned tid;
@@ -3159,7 +3071,7 @@ out_drop_task:
3159out: 3071out:
3160 put_task_struct(leader); 3072 put_task_struct(leader);
3161out_no_task: 3073out_no_task:
3162 return result; 3074 return ERR_PTR(result);
3163} 3075}
3164 3076
3165/* 3077/*
@@ -3231,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3231 return pos; 3143 return pos;
3232} 3144}
3233 3145
3234static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3235 struct task_struct *task, int tid)
3236{
3237 char name[PROC_NUMBUF];
3238 int len = snprintf(name, sizeof(name), "%d", tid);
3239 return proc_fill_cache(filp, dirent, filldir, name, len,
3240 proc_task_instantiate, task, NULL);
3241}
3242
3243/* for the /proc/TGID/task/ directories */ 3146/* for the /proc/TGID/task/ directories */
3244static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 3147static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3245{ 3148{
3246 struct dentry *dentry = filp->f_path.dentry;
3247 struct inode *inode = dentry->d_inode;
3248 struct task_struct *leader = NULL; 3149 struct task_struct *leader = NULL;
3249 struct task_struct *task; 3150 struct task_struct *task = get_proc_task(file_inode(file));
3250 int retval = -ENOENT;
3251 ino_t ino;
3252 int tid;
3253 struct pid_namespace *ns; 3151 struct pid_namespace *ns;
3152 int tid;
3254 3153
3255 task = get_proc_task(inode);
3256 if (!task) 3154 if (!task)
3257 goto out_no_task; 3155 return -ENOENT;
3258 rcu_read_lock(); 3156 rcu_read_lock();
3259 if (pid_alive(task)) { 3157 if (pid_alive(task)) {
3260 leader = task->group_leader; 3158 leader = task->group_leader;
@@ -3263,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3263 rcu_read_unlock(); 3161 rcu_read_unlock();
3264 put_task_struct(task); 3162 put_task_struct(task);
3265 if (!leader) 3163 if (!leader)
3266 goto out_no_task; 3164 return -ENOENT;
3267 retval = 0;
3268 3165
3269 switch ((unsigned long)filp->f_pos) { 3166 if (!dir_emit_dots(file, ctx))
3270 case 0: 3167 goto out;
3271 ino = inode->i_ino;
3272 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3273 goto out;
3274 filp->f_pos++;
3275 /* fall through */
3276 case 1:
3277 ino = parent_ino(dentry);
3278 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3279 goto out;
3280 filp->f_pos++;
3281 /* fall through */
3282 }
3283 3168
3284 /* f_version caches the tgid value that the last readdir call couldn't 3169 /* f_version caches the tgid value that the last readdir call couldn't
3285 * return. lseek aka telldir automagically resets f_version to 0. 3170 * return. lseek aka telldir automagically resets f_version to 0.
3286 */ 3171 */
3287 ns = filp->f_dentry->d_sb->s_fs_info; 3172 ns = file->f_dentry->d_sb->s_fs_info;
3288 tid = (int)filp->f_version; 3173 tid = (int)file->f_version;
3289 filp->f_version = 0; 3174 file->f_version = 0;
3290 for (task = first_tid(leader, tid, filp->f_pos - 2, ns); 3175 for (task = first_tid(leader, tid, ctx->pos - 2, ns);
3291 task; 3176 task;
3292 task = next_tid(task), filp->f_pos++) { 3177 task = next_tid(task), ctx->pos++) {
3178 char name[PROC_NUMBUF];
3179 int len;
3293 tid = task_pid_nr_ns(task, ns); 3180 tid = task_pid_nr_ns(task, ns);
3294 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3181 len = snprintf(name, sizeof(name), "%d", tid);
3182 if (!proc_fill_cache(file, ctx, name, len,
3183 proc_task_instantiate, task, NULL)) {
3295 /* returning this tgid failed, save it as the first 3184 /* returning this tgid failed, save it as the first
3296 * pid for the next readir call */ 3185 * pid for the next readir call */
3297 filp->f_version = (u64)tid; 3186 file->f_version = (u64)tid;
3298 put_task_struct(task); 3187 put_task_struct(task);
3299 break; 3188 break;
3300 } 3189 }
3301 } 3190 }
3302out: 3191out:
3303 put_task_struct(leader); 3192 put_task_struct(leader);
3304out_no_task: 3193 return 0;
3305 return retval;
3306} 3194}
3307 3195
3308static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 3196static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3328,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
3328 3216
3329static const struct file_operations proc_task_operations = { 3217static const struct file_operations proc_task_operations = {
3330 .read = generic_read_dir, 3218 .read = generic_read_dir,
3331 .readdir = proc_task_readdir, 3219 .iterate = proc_task_readdir,
3332 .llseek = default_llseek, 3220 .llseek = default_llseek,
3333}; 3221};
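
The conversion pattern running through all the hunks above: the old readdir()/filldir callback pair becomes iterate() with a struct dir_context, dir_emit_dots() replaces the hand-rolled "." and ".." cases, dir_emit() returns false once the user buffer fills, and ctx->pos takes over from filp->f_pos as the resume cursor. A minimal sketch of the new shape, with hypothetical my_names and my_dir_iterate (not code from this patch):

    #include <linux/fs.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    static const char *my_names[] = { "alpha", "beta", "gamma" };

    static int my_dir_iterate(struct file *file, struct dir_context *ctx)
    {
            unsigned int i;

            if (!dir_emit_dots(file, ctx))          /* emits "." and ".." */
                    return 0;
            for (i = ctx->pos - 2; i < ARRAY_SIZE(my_names); i++) {
                    /* false means the buffer is full; ctx->pos is left at
                     * the unreturned entry so the next call resumes here */
                    if (!dir_emit(ctx, my_names[i], strlen(my_names[i]),
                                  100 + i /* ino */, DT_REG))
                            return 0;
                    ctx->pos++;
            }
            return 0;
    }
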
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
167 return ret; 167 return ret;
168} 168}
169 169
170static struct dentry * 170static int
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry, 171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr) 172 struct task_struct *task, const void *ptr)
173{ 173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr; 174 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei; 175 struct proc_inode *ei;
177 struct inode *inode; 176 struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
194 193
195 /* Close the race of the process dying before we return the dentry */ 194 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0)) 195 if (tid_fd_revalidate(dentry, 0))
197 error = NULL; 196 return 0;
198 out: 197 out:
199 return error; 198 return -ENOENT;
200} 199}
201 200
202static struct dentry *proc_lookupfd_common(struct inode *dir, 201static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
204 instantiate_t instantiate) 203 instantiate_t instantiate)
205{ 204{
206 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT); 206 int result = -ENOENT;
208 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(dentry);
209 208
210 if (!task) 209 if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
216out: 215out:
217 put_task_struct(task); 216 put_task_struct(task);
218out_no_task: 217out_no_task:
219 return result; 218 return ERR_PTR(result);
220} 219}
221 220
222static int proc_readfd_common(struct file * filp, void * dirent, 221static int proc_readfd_common(struct file *file, struct dir_context *ctx,
223 filldir_t filldir, instantiate_t instantiate) 222 instantiate_t instantiate)
224{ 223{
225 struct dentry *dentry = filp->f_path.dentry; 224 struct task_struct *p = get_proc_task(file_inode(file));
226 struct inode *inode = dentry->d_inode;
227 struct task_struct *p = get_proc_task(inode);
228 struct files_struct *files; 225 struct files_struct *files;
229 unsigned int fd, ino; 226 unsigned int fd;
230 int retval;
231 227
232 retval = -ENOENT;
233 if (!p) 228 if (!p)
234 goto out_no_task; 229 return -ENOENT;
235 retval = 0;
236
237 fd = filp->f_pos;
238 switch (fd) {
239 case 0:
240 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
241 goto out;
242 filp->f_pos++;
243 case 1:
244 ino = parent_ino(dentry);
245 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
246 goto out;
247 filp->f_pos++;
248 default:
249 files = get_files_struct(p);
250 if (!files)
251 goto out;
252 rcu_read_lock();
253 for (fd = filp->f_pos - 2;
254 fd < files_fdtable(files)->max_fds;
255 fd++, filp->f_pos++) {
256 char name[PROC_NUMBUF];
257 int len;
258 int rv;
259
260 if (!fcheck_files(files, fd))
261 continue;
262 rcu_read_unlock();
263 230
264 len = snprintf(name, sizeof(name), "%d", fd); 231 if (!dir_emit_dots(file, ctx))
265 rv = proc_fill_cache(filp, dirent, filldir, 232 goto out;
266 name, len, instantiate, p, 233 if (!dir_emit_dots(file, ctx))
267 (void *)(unsigned long)fd); 234 goto out;
268 if (rv < 0) 235 files = get_files_struct(p);
269 goto out_fd_loop; 236 if (!files)
270 rcu_read_lock(); 237 goto out;
271 } 238
272 rcu_read_unlock(); 239 rcu_read_lock();
273out_fd_loop: 240 for (fd = ctx->pos - 2;
274 put_files_struct(files); 241 fd < files_fdtable(files)->max_fds;
242 fd++, ctx->pos++) {
243 char name[PROC_NUMBUF];
244 int len;
245
246 if (!fcheck_files(files, fd))
247 continue;
248 rcu_read_unlock();
249
250 len = snprintf(name, sizeof(name), "%d", fd);
251 if (!proc_fill_cache(file, ctx,
252 name, len, instantiate, p,
253 (void *)(unsigned long)fd))
254 goto out_fd_loop;
255 rcu_read_lock();
275 } 256 }
257 rcu_read_unlock();
258out_fd_loop:
259 put_files_struct(files);
276out: 260out:
277 put_task_struct(p); 261 put_task_struct(p);
278out_no_task: 262 return 0;
279 return retval;
280} 263}
281 264
282static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) 265static int proc_readfd(struct file *file, struct dir_context *ctx)
283{ 266{
284 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); 267 return proc_readfd_common(file, ctx, proc_fd_instantiate);
285} 268}
286 269
287const struct file_operations proc_fd_operations = { 270const struct file_operations proc_fd_operations = {
288 .read = generic_read_dir, 271 .read = generic_read_dir,
289 .readdir = proc_readfd, 272 .iterate = proc_readfd,
290 .llseek = default_llseek, 273 .llseek = default_llseek,
291}; 274};
292 275
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
316 .setattr = proc_setattr, 299 .setattr = proc_setattr,
317}; 300};
318 301
319static struct dentry * 302static int
320proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, 303proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
321 struct task_struct *task, const void *ptr) 304 struct task_struct *task, const void *ptr)
322{ 305{
323 struct dentry *error = ERR_PTR(-ENOENT);
324 unsigned fd = (unsigned long)ptr; 306 unsigned fd = (unsigned long)ptr;
325 struct proc_inode *ei; 307 struct proc_inode *ei;
326 struct inode *inode; 308 struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
340 322
341 /* Close the race of the process dying before we return the dentry */ 323 /* Close the race of the process dying before we return the dentry */
342 if (tid_fd_revalidate(dentry, 0)) 324 if (tid_fd_revalidate(dentry, 0))
343 error = NULL; 325 return 0;
344 out: 326 out:
345 return error; 327 return -ENOENT;
346} 328}
347 329
348static struct dentry * 330static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
351 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); 333 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
352} 334}
353 335
354static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) 336static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
355{ 337{
356 return proc_readfd_common(filp, dirent, filldir, 338 return proc_readfd_common(file, ctx,
357 proc_fdinfo_instantiate); 339 proc_fdinfo_instantiate);
358} 340}
359 341
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
364 346
365const struct file_operations proc_fdinfo_operations = { 347const struct file_operations proc_fdinfo_operations = {
366 .read = generic_read_dir, 348 .read = generic_read_dir,
367 .readdir = proc_readfdinfo, 349 .iterate = proc_readfdinfo,
368 .llseek = default_llseek, 350 .llseek = default_llseek,
369}; 351};
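
The instantiate callbacks change shape the same way across base.c, fd.c and namespaces.c: they used to return struct dentry * (NULL on success, ERR_PTR(-ENOENT) on failure) and now return a plain int, with proc_fill_cache() returning bool (false meaning stop iterating). A sketch of the new contract using the helpers visible in these hunks; demo_instantiate is a hypothetical name:

    static int demo_instantiate(struct inode *dir, struct dentry *dentry,
                                struct task_struct *task, const void *ptr)
    {
            struct inode *inode = proc_pid_make_inode(dir->i_sb, task);

            if (!inode)
                    return -ENOENT;
            /* ... set inode->i_mode, i_op, i_fop for the entry ... */
            d_add(dentry, inode);
            /* close the race of the process dying before we return */
            if (tid_fd_revalidate(dentry, 0))
                    return 0;
            return -ENOENT;
    }
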
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596afffae6..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
233 * value of the readdir() call, as long as it's non-negative 233 * value of the readdir() call, as long as it's non-negative
234 * for success. 234 * for success.
235 */ 235 */
236int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 236int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
237 filldir_t filldir) 237 struct dir_context *ctx)
238{ 238{
239 unsigned int ino;
240 int i; 239 int i;
241 struct inode *inode = file_inode(filp);
242 int ret = 0;
243
244 ino = inode->i_ino;
245 i = filp->f_pos;
246 switch (i) {
247 case 0:
248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
249 goto out;
250 i++;
251 filp->f_pos++;
252 /* fall through */
253 case 1:
254 if (filldir(dirent, "..", 2, i,
255 parent_ino(filp->f_path.dentry),
256 DT_DIR) < 0)
257 goto out;
258 i++;
259 filp->f_pos++;
260 /* fall through */
261 default:
262 spin_lock(&proc_subdir_lock);
263 de = de->subdir;
264 i -= 2;
265 for (;;) {
266 if (!de) {
267 ret = 1;
268 spin_unlock(&proc_subdir_lock);
269 goto out;
270 }
271 if (!i)
272 break;
273 de = de->next;
274 i--;
275 }
276 240
277 do { 241 if (!dir_emit_dots(file, ctx))
278 struct proc_dir_entry *next; 242 return 0;
279 243
280 /* filldir passes info to user space */ 244 spin_lock(&proc_subdir_lock);
281 pde_get(de); 245 de = de->subdir;
282 spin_unlock(&proc_subdir_lock); 246 i = ctx->pos - 2;
283 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 247 for (;;) {
284 de->low_ino, de->mode >> 12) < 0) { 248 if (!de) {
285 pde_put(de);
286 goto out;
287 }
288 spin_lock(&proc_subdir_lock);
289 filp->f_pos++;
290 next = de->next;
291 pde_put(de);
292 de = next;
293 } while (de);
294 spin_unlock(&proc_subdir_lock); 249 spin_unlock(&proc_subdir_lock);
250 return 0;
251 }
252 if (!i)
253 break;
254 de = de->next;
255 i--;
295 } 256 }
296 ret = 1; 257
297out: 258 do {
298 return ret; 259 struct proc_dir_entry *next;
260 pde_get(de);
261 spin_unlock(&proc_subdir_lock);
262 if (!dir_emit(ctx, de->name, de->namelen,
263 de->low_ino, de->mode >> 12)) {
264 pde_put(de);
265 return 0;
266 }
267 spin_lock(&proc_subdir_lock);
268 ctx->pos++;
269 next = de->next;
270 pde_put(de);
271 de = next;
272 } while (de);
273 spin_unlock(&proc_subdir_lock);
274 return 0;
299} 275}
300 276
301int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) 277int proc_readdir(struct file *file, struct dir_context *ctx)
302{ 278{
303 struct inode *inode = file_inode(filp); 279 struct inode *inode = file_inode(file);
304 280
305 return proc_readdir_de(PDE(inode), filp, dirent, filldir); 281 return proc_readdir_de(PDE(inode), file, ctx);
306} 282}
307 283
308/* 284/*
@@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
313static const struct file_operations proc_dir_operations = { 289static const struct file_operations proc_dir_operations = {
314 .llseek = generic_file_llseek, 290 .llseek = generic_file_llseek,
315 .read = generic_read_dir, 291 .read = generic_read_dir,
316 .readdir = proc_readdir, 292 .iterate = proc_readdir,
317}; 293};
318 294
319/* 295/*
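
One subtlety survives the rewrite of proc_readdir_de() above: dir_emit() copies to user space and may sleep, so it must never run under proc_subdir_lock; the current entry is pinned with pde_get() before the lock is dropped and released with pde_put() after it is retaken. The generic shape of that pattern, with hypothetical types and helpers:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_entry {
            struct demo_entry *next;
            /* name, inode number, reference count, ... */
    };

    static void demo_walk(struct demo_entry *de, spinlock_t *lock,
                          bool (*emit)(struct demo_entry *),
                          void (*get)(struct demo_entry *),
                          void (*put)(struct demo_entry *))
    {
            spin_lock(lock);
            while (de) {
                    struct demo_entry *next;

                    get(de);                /* pin de across the unlock */
                    spin_unlock(lock);
                    if (!emit(de)) {        /* may sleep, copy to user */
                            put(de);
                            return;
                    }
                    spin_lock(lock);
                    next = de->next;        /* safe: de is still pinned */
                    put(de);
                    de = next;
            }
            spin_unlock(lock);
    }
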
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *);
165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); 165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
166extern int pid_revalidate(struct dentry *, unsigned int); 166extern int pid_revalidate(struct dentry *, unsigned int);
167extern int pid_delete_dentry(const struct dentry *); 167extern int pid_delete_dentry(const struct dentry *);
168extern int proc_pid_readdir(struct file *, void *, filldir_t); 168extern int proc_pid_readdir(struct file *, struct dir_context *);
169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); 169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
170extern loff_t mem_lseek(struct file *, loff_t, int); 170extern loff_t mem_lseek(struct file *, loff_t, int);
171 171
172/* Lookups */ 172/* Lookups */
173typedef struct dentry *instantiate_t(struct inode *, struct dentry *, 173typedef int instantiate_t(struct inode *, struct dentry *,
174 struct task_struct *, const void *); 174 struct task_struct *, const void *);
175extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, 175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
176 instantiate_t, struct task_struct *, const void *); 176 instantiate_t, struct task_struct *, const void *);
177 177
178/* 178/*
@@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock;
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 185 struct dentry *);
186extern int proc_readdir(struct file *, void *, filldir_t); 186extern int proc_readdir(struct file *, struct dir_context *);
187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); 187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
188 188
189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
190{ 190{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
408 prpsinfo.pr_zomb = 0; 408 prpsinfo.pr_zomb = 0;
409 409
410 strcpy(prpsinfo.pr_fname, "vmlinux"); 410 strcpy(prpsinfo.pr_fname, "vmlinux");
411 strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); 411 strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
412 412
413 nhdr->p_filesz += notesize(&notes[1]); 413 nhdr->p_filesz += notesize(&notes[1]);
414 bufp = storenote(&notes[1], bufp); 414 bufp = storenote(&notes[1], bufp);
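
The kcore hunk above swaps strncpy() for strlcpy() because strncpy() leaves the destination unterminated whenever the source is at least as long as the buffer, while strlcpy() always NUL-terminates within the given size (here spelled sizeof(prpsinfo.pr_psargs)). A user-space demonstration with a local my_strlcpy(), since glibc does not provide strlcpy():

    #include <stdio.h>
    #include <string.h>

    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
            size_t len = strlen(src);

            if (size) {
                    size_t n = len >= size ? size - 1 : len;

                    memcpy(dst, src, n);
                    dst[n] = '\0';          /* always terminated */
            }
            return len;     /* length it tried to create, as the kernel's does */
    }

    int main(void)
    {
            char a[8], b[8];

            strncpy(a, "0123456789", sizeof(a));    /* a is NOT terminated */
            my_strlcpy(b, "0123456789", sizeof(b)); /* b = "0123456" */
            printf("%.8s / %s\n", a, b);
            return 0;
    }
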
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc6701e9f..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
187 .setattr = proc_setattr, 187 .setattr = proc_setattr,
188}; 188};
189 189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 190static int proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 192{
193 const struct proc_ns_operations *ns_ops = ptr; 193 const struct proc_ns_operations *ns_ops = ptr;
194 struct inode *inode; 194 struct inode *inode;
195 struct proc_inode *ei; 195 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT);
197 196
198 inode = proc_pid_make_inode(dir->i_sb, task); 197 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 198 if (!inode)
@@ -208,90 +207,52 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
208 d_add(dentry, inode); 207 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 208 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 209 if (pid_revalidate(dentry, 0))
211 error = NULL; 210 return 0;
212out: 211out:
213 return error; 212 return -ENOENT;
214}
215
216static int proc_ns_fill_cache(struct file *filp, void *dirent,
217 filldir_t filldir, struct task_struct *task,
218 const struct proc_ns_operations *ops)
219{
220 return proc_fill_cache(filp, dirent, filldir,
221 ops->name, strlen(ops->name),
222 proc_ns_instantiate, task, ops);
223} 213}
224 214
225static int proc_ns_dir_readdir(struct file *filp, void *dirent, 215static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
226 filldir_t filldir)
227{ 216{
228 int i; 217 struct task_struct *task = get_proc_task(file_inode(file));
229 struct dentry *dentry = filp->f_path.dentry;
230 struct inode *inode = dentry->d_inode;
231 struct task_struct *task = get_proc_task(inode);
232 const struct proc_ns_operations **entry, **last; 218 const struct proc_ns_operations **entry, **last;
233 ino_t ino;
234 int ret;
235 219
236 ret = -ENOENT;
237 if (!task) 220 if (!task)
238 goto out_no_task; 221 return -ENOENT;
239 222
240 ret = 0; 223 if (!dir_emit_dots(file, ctx))
241 i = filp->f_pos; 224 goto out;
242 switch (i) { 225 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
243 case 0: 226 goto out;
244 ino = inode->i_ino; 227 entry = ns_entries + (ctx->pos - 2);
245 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 228 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
246 goto out; 229 while (entry <= last) {
247 i++; 230 const struct proc_ns_operations *ops = *entry;
248 filp->f_pos++; 231 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
249 /* fall through */ 232 proc_ns_instantiate, task, ops))
250 case 1: 233 break;
251 ino = parent_ino(dentry); 234 ctx->pos++;
252 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 235 entry++;
253 goto out;
254 i++;
255 filp->f_pos++;
256 /* fall through */
257 default:
258 i -= 2;
259 if (i >= ARRAY_SIZE(ns_entries)) {
260 ret = 1;
261 goto out;
262 }
263 entry = ns_entries + i;
264 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
265 while (entry <= last) {
266 if (proc_ns_fill_cache(filp, dirent, filldir,
267 task, *entry) < 0)
268 goto out;
269 filp->f_pos++;
270 entry++;
271 }
272 } 236 }
273
274 ret = 1;
275out: 237out:
276 put_task_struct(task); 238 put_task_struct(task);
277out_no_task: 239 return 0;
278 return ret;
279} 240}
280 241
281const struct file_operations proc_ns_dir_operations = { 242const struct file_operations proc_ns_dir_operations = {
282 .read = generic_read_dir, 243 .read = generic_read_dir,
283 .readdir = proc_ns_dir_readdir, 244 .iterate = proc_ns_dir_readdir,
284}; 245};
285 246
286static struct dentry *proc_ns_dir_lookup(struct inode *dir, 247static struct dentry *proc_ns_dir_lookup(struct inode *dir,
287 struct dentry *dentry, unsigned int flags) 248 struct dentry *dentry, unsigned int flags)
288{ 249{
289 struct dentry *error; 250 int error;
290 struct task_struct *task = get_proc_task(dir); 251 struct task_struct *task = get_proc_task(dir);
291 const struct proc_ns_operations **entry, **last; 252 const struct proc_ns_operations **entry, **last;
292 unsigned int len = dentry->d_name.len; 253 unsigned int len = dentry->d_name.len;
293 254
294 error = ERR_PTR(-ENOENT); 255 error = -ENOENT;
295 256
296 if (!task) 257 if (!task)
297 goto out_no_task; 258 goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
310out: 271out:
311 put_task_struct(task); 272 put_task_struct(task);
312out_no_task: 273out_no_task:
313 return error; 274 return ERR_PTR(error);
314} 275}
315 276
316const struct inode_operations proc_ns_dir_inode_operations = { 277const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e83220d56..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
160 .getattr = proc_tgid_net_getattr, 160 .getattr = proc_tgid_net_getattr,
161}; 161};
162 162
163static int proc_tgid_net_readdir(struct file *filp, void *dirent, 163static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
164 filldir_t filldir)
165{ 164{
166 int ret; 165 int ret;
167 struct net *net; 166 struct net *net;
168 167
169 ret = -EINVAL; 168 ret = -EINVAL;
170 net = get_proc_task_net(file_inode(filp)); 169 net = get_proc_task_net(file_inode(file));
171 if (net != NULL) { 170 if (net != NULL) {
172 ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); 171 ret = proc_readdir_de(net->proc_net, file, ctx);
173 put_net(net); 172 put_net(net);
174 } 173 }
175 return ret; 174 return ret;
@@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
178const struct file_operations proc_net_operations = { 177const struct file_operations proc_net_operations = {
179 .llseek = generic_file_llseek, 178 .llseek = generic_file_llseek,
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_tgid_net_readdir, 180 .iterate = proc_tgid_net_readdir,
182}; 181};
183 182
184static __net_init int proc_net_ns_init(struct net *net) 183static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
573 return ret; 573 return ret;
574} 574}
575 575
576static int proc_sys_fill_cache(struct file *filp, void *dirent, 576static bool proc_sys_fill_cache(struct file *file,
577 filldir_t filldir, 577 struct dir_context *ctx,
578 struct ctl_table_header *head, 578 struct ctl_table_header *head,
579 struct ctl_table *table) 579 struct ctl_table *table)
580{ 580{
581 struct dentry *child, *dir = filp->f_path.dentry; 581 struct dentry *child, *dir = file->f_path.dentry;
582 struct inode *inode; 582 struct inode *inode;
583 struct qstr qname; 583 struct qstr qname;
584 ino_t ino = 0; 584 ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
595 inode = proc_sys_make_inode(dir->d_sb, head, table); 595 inode = proc_sys_make_inode(dir->d_sb, head, table);
596 if (!inode) { 596 if (!inode) {
597 dput(child); 597 dput(child);
598 return -ENOMEM; 598 return false;
599 } else { 599 } else {
600 d_set_d_op(child, &proc_sys_dentry_operations); 600 d_set_d_op(child, &proc_sys_dentry_operations);
601 d_add(child, inode); 601 d_add(child, inode);
602 } 602 }
603 } else { 603 } else {
604 return -ENOMEM; 604 return false;
605 } 605 }
606 } 606 }
607 inode = child->d_inode; 607 inode = child->d_inode;
608 ino = inode->i_ino; 608 ino = inode->i_ino;
609 type = inode->i_mode >> 12; 609 type = inode->i_mode >> 12;
610 dput(child); 610 dput(child);
611 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 611 return dir_emit(ctx, qname.name, qname.len, ino, type);
612} 612}
613 613
614static int proc_sys_link_fill_cache(struct file *filp, void *dirent, 614static bool proc_sys_link_fill_cache(struct file *file,
615 filldir_t filldir, 615 struct dir_context *ctx,
616 struct ctl_table_header *head, 616 struct ctl_table_header *head,
617 struct ctl_table *table) 617 struct ctl_table *table)
618{ 618{
619 int err, ret = 0; 619 bool ret = true;
620 head = sysctl_head_grab(head); 620 head = sysctl_head_grab(head);
621 621
622 if (S_ISLNK(table->mode)) { 622 if (S_ISLNK(table->mode)) {
623 /* It is not an error if we cannot follow the link; ignore it */ 623 /* It is not an error if we cannot follow the link; ignore it */
624 err = sysctl_follow_link(&head, &table, current->nsproxy); 624 int err = sysctl_follow_link(&head, &table, current->nsproxy);
625 if (err) 625 if (err)
626 goto out; 626 goto out;
627 } 627 }
628 628
629 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); 629 ret = proc_sys_fill_cache(file, ctx, head, table);
630out: 630out:
631 sysctl_head_finish(head); 631 sysctl_head_finish(head);
632 return ret; 632 return ret;
@@ -634,67 +634,50 @@ out:
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 void *dirent, filldir_t filldir) 637 struct dir_context *ctx)
638{ 638{
639 int res; 639 bool res;
640 640
641 if ((*pos)++ < file->f_pos) 641 if ((*pos)++ < ctx->pos)
642 return 0; 642 return true;
643 643
644 if (unlikely(S_ISLNK(table->mode))) 644 if (unlikely(S_ISLNK(table->mode)))
645 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); 645 res = proc_sys_link_fill_cache(file, ctx, head, table);
646 else 646 else
647 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 647 res = proc_sys_fill_cache(file, ctx, head, table);
648 648
649 if (res == 0) 649 if (res)
650 file->f_pos = *pos; 650 ctx->pos = *pos;
651 651
652 return res; 652 return res;
653} 653}
654 654
655static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 655static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
656{ 656{
657 struct dentry *dentry = filp->f_path.dentry; 657 struct ctl_table_header *head = grab_header(file_inode(file));
658 struct inode *inode = dentry->d_inode;
659 struct ctl_table_header *head = grab_header(inode);
660 struct ctl_table_header *h = NULL; 658 struct ctl_table_header *h = NULL;
661 struct ctl_table *entry; 659 struct ctl_table *entry;
662 struct ctl_dir *ctl_dir; 660 struct ctl_dir *ctl_dir;
663 unsigned long pos; 661 unsigned long pos;
664 int ret = -EINVAL;
665 662
666 if (IS_ERR(head)) 663 if (IS_ERR(head))
667 return PTR_ERR(head); 664 return PTR_ERR(head);
668 665
669 ctl_dir = container_of(head, struct ctl_dir, header); 666 ctl_dir = container_of(head, struct ctl_dir, header);
670 667
671 ret = 0; 668 if (!dir_emit_dots(file, ctx))
672 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 669 return 0;
673 if (filp->f_pos == 0) { 670
674 if (filldir(dirent, ".", 1, filp->f_pos,
675 inode->i_ino, DT_DIR) < 0)
676 goto out;
677 filp->f_pos++;
678 }
679 if (filp->f_pos == 1) {
680 if (filldir(dirent, "..", 2, filp->f_pos,
681 parent_ino(dentry), DT_DIR) < 0)
682 goto out;
683 filp->f_pos++;
684 }
685 pos = 2; 671 pos = 2;
686 672
687 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { 673 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
688 ret = scan(h, entry, &pos, filp, dirent, filldir); 674 if (!scan(h, entry, &pos, file, ctx)) {
689 if (ret) {
690 sysctl_head_finish(h); 675 sysctl_head_finish(h);
691 break; 676 break;
692 } 677 }
693 } 678 }
694 ret = 1;
695out:
696 sysctl_head_finish(head); 679 sysctl_head_finish(head);
697 return ret; 680 return 0;
698} 681}
699 682
700static int proc_sys_permission(struct inode *inode, int mask) 683static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
769 752
770static const struct file_operations proc_sys_dir_file_operations = { 753static const struct file_operations proc_sys_dir_file_operations = {
771 .read = generic_read_dir, 754 .read = generic_read_dir,
772 .readdir = proc_sys_readdir, 755 .iterate = proc_sys_readdir,
773 .llseek = generic_file_llseek, 756 .llseek = generic_file_llseek,
774}; 757};
775 758
@@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
813 return res; 796 return res;
814} 797}
815 798
816static int proc_sys_compare(const struct dentry *parent, 799static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
817 const struct inode *pinode,
818 const struct dentry *dentry, const struct inode *inode,
819 unsigned int len, const char *str, const struct qstr *name) 800 unsigned int len, const char *str, const struct qstr *name)
820{ 801{
821 struct ctl_table_header *head; 802 struct ctl_table_header *head;
803 struct inode *inode;
804
822 /* Although proc doesn't have negative dentries, rcu-walk means 805 /* Although proc doesn't have negative dentries, rcu-walk means
823 * that inode here can be NULL */ 806 * that inode here can be NULL */
824 /* AV: can it, indeed? */ 807 /* AV: can it, indeed? */
808 inode = ACCESS_ONCE(dentry->d_inode);
825 if (!inode) 809 if (!inode)
826 return 1; 810 return 1;
827 if (name->len != len) 811 if (name->len != len)
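
On the sysctl side, proc_sys_compare() above now reads d_inode exactly once through ACCESS_ONCE(): under RCU-walk the dentry is not pinned, so another CPU may change the inode pointer mid-comparison, and every subsequent check has to use a single snapshot. A sketch of the idiom; demo_compare is a hypothetical name:

    #include <linux/compiler.h>     /* ACCESS_ONCE(): one volatile read */
    #include <linux/dcache.h>

    static int demo_compare(const struct dentry *dentry)
    {
            /* one snapshot; rcu-walk may set d_inode to NULL under us,
             * so never re-read it during the comparison */
            struct inode *inode = ACCESS_ONCE(dentry->d_inode);

            if (!inode)
                    return 1;       /* mismatch; VFS retries in ref-walk */
            /* ... compare using this "inode" only ... */
            return 0;
    }
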
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea93f486..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
202 return proc_pid_lookup(dir, dentry, flags); 202 return proc_pid_lookup(dir, dentry, flags);
203} 203}
204 204
205static int proc_root_readdir(struct file * filp, 205static int proc_root_readdir(struct file *file, struct dir_context *ctx)
206 void * dirent, filldir_t filldir)
207{ 206{
208 unsigned int nr = filp->f_pos; 207 if (ctx->pos < FIRST_PROCESS_ENTRY) {
209 int ret; 208 proc_readdir(file, ctx);
210 209 ctx->pos = FIRST_PROCESS_ENTRY;
211 if (nr < FIRST_PROCESS_ENTRY) {
212 int error = proc_readdir(filp, dirent, filldir);
213 if (error <= 0)
214 return error;
215 filp->f_pos = FIRST_PROCESS_ENTRY;
216 } 210 }
217 211
218 ret = proc_pid_readdir(filp, dirent, filldir); 212 return proc_pid_readdir(file, ctx);
219 return ret;
220} 213}
221 214
222/* 215/*
@@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
226 */ 219 */
227static const struct file_operations proc_root_operations = { 220static const struct file_operations proc_root_operations = {
228 .read = generic_read_dir, 221 .read = generic_read_dir,
229 .readdir = proc_root_readdir, 222 .iterate = proc_root_readdir,
230 .llseek = default_llseek, 223 .llseek = default_llseek,
231}; 224};
232 225
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..107d026f5d6e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/rmap.h> 11#include <linux/rmap.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h> 13#include <linux/swapops.h>
14#include <linux/mmu_notifier.h>
14 15
15#include <asm/elf.h> 16#include <asm/elf.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
@@ -688,10 +689,66 @@ const struct file_operations proc_tid_smaps_operations = {
688 .release = seq_release_private, 689 .release = seq_release_private,
689}; 690};
690 691
692/*
693 * We do not want to have constant page-shift bits sitting in
694 * pagemap entries and are about to reuse them some time soon.
695 *
696 * Here's the "migration strategy":
697 * 1. when the system boots, these bits remain what they are,
698 * but a warning about the future change is printed in the log;
699 * 2. once anyone clears soft-dirty bits via the clear_refs file,
700 * this flag is set to denote that the user is aware of the
701 * new API, and those page-shift bits change their meaning.
702 * The respective warning is printed in dmesg;
703 * 3. In a couple of releases we will remove all the mentions
704 * of page-shift in pagemap entries.
705 */
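
A user-space sketch of step 2 above: writing "4" (CLEAR_REFS_SOFT_DIRTY in the enum below) to /proc/PID/clear_refs write-protects the task's PTEs, clears their soft-dirty bits, and switches pagemap to the new layout in which bit 55 of each entry reports soft-dirty. clear_soft_dirty_bits() is a hypothetical helper, not part of this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int clear_soft_dirty_bits(pid_t pid)
    {
            char path[64];
            int fd, ok;

            snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            ok = write(fd, "4", 1) == 1;    /* 4 == CLEAR_REFS_SOFT_DIRTY */
            close(fd);
            return ok ? 0 : -1;
    }

    int main(void)
    {
            if (clear_soft_dirty_bits(getpid()))
                    perror("clear_refs");
            return 0;
    }
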
706
707static bool soft_dirty_cleared __read_mostly;
708
709enum clear_refs_types {
710 CLEAR_REFS_ALL = 1,
711 CLEAR_REFS_ANON,
712 CLEAR_REFS_MAPPED,
713 CLEAR_REFS_SOFT_DIRTY,
714 CLEAR_REFS_LAST,
715};
716
717struct clear_refs_private {
718 struct vm_area_struct *vma;
719 enum clear_refs_types type;
720};
721
722static inline void clear_soft_dirty(struct vm_area_struct *vma,
723 unsigned long addr, pte_t *pte)
724{
725#ifdef CONFIG_MEM_SOFT_DIRTY
726 /*
727 * The soft-dirty tracker uses #PF-s to catch writes
728 * to pages, so write-protect the pte as well. See the
729 * Documentation/vm/soft-dirty.txt for full description
730 * of how soft-dirty works.
731 */
732 pte_t ptent = *pte;
733
734 if (pte_present(ptent)) {
735 ptent = pte_wrprotect(ptent);
736 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
737 } else if (is_swap_pte(ptent)) {
738 ptent = pte_swp_clear_soft_dirty(ptent);
739 } else if (pte_file(ptent)) {
740 ptent = pte_file_clear_soft_dirty(ptent);
741 }
742
743 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif
745}
746
691static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 747static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
692 unsigned long end, struct mm_walk *walk) 748 unsigned long end, struct mm_walk *walk)
693{ 749{
694 struct vm_area_struct *vma = walk->private; 750 struct clear_refs_private *cp = walk->private;
751 struct vm_area_struct *vma = cp->vma;
695 pte_t *pte, ptent; 752 pte_t *pte, ptent;
696 spinlock_t *ptl; 753 spinlock_t *ptl;
697 struct page *page; 754 struct page *page;
@@ -703,6 +760,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
703 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 760 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
704 for (; addr != end; pte++, addr += PAGE_SIZE) { 761 for (; addr != end; pte++, addr += PAGE_SIZE) {
705 ptent = *pte; 762 ptent = *pte;
763
764 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
765 clear_soft_dirty(vma, addr, pte);
766 continue;
767 }
768
706 if (!pte_present(ptent)) 769 if (!pte_present(ptent))
707 continue; 770 continue;
708 771
@@ -719,10 +782,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
719 return 0; 782 return 0;
720} 783}
721 784
722#define CLEAR_REFS_ALL 1
723#define CLEAR_REFS_ANON 2
724#define CLEAR_REFS_MAPPED 3
725
726static ssize_t clear_refs_write(struct file *file, const char __user *buf, 785static ssize_t clear_refs_write(struct file *file, const char __user *buf,
727 size_t count, loff_t *ppos) 786 size_t count, loff_t *ppos)
728{ 787{
@@ -730,7 +789,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
730 char buffer[PROC_NUMBUF]; 789 char buffer[PROC_NUMBUF];
731 struct mm_struct *mm; 790 struct mm_struct *mm;
732 struct vm_area_struct *vma; 791 struct vm_area_struct *vma;
733 int type; 792 enum clear_refs_types type;
793 int itype;
734 int rv; 794 int rv;
735 795
736 memset(buffer, 0, sizeof(buffer)); 796 memset(buffer, 0, sizeof(buffer));
@@ -738,23 +798,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
738 count = sizeof(buffer) - 1; 798 count = sizeof(buffer) - 1;
739 if (copy_from_user(buffer, buf, count)) 799 if (copy_from_user(buffer, buf, count))
740 return -EFAULT; 800 return -EFAULT;
741 rv = kstrtoint(strstrip(buffer), 10, &type); 801 rv = kstrtoint(strstrip(buffer), 10, &itype);
742 if (rv < 0) 802 if (rv < 0)
743 return rv; 803 return rv;
744 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 804 type = (enum clear_refs_types)itype;
805 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
745 return -EINVAL; 806 return -EINVAL;
807
808 if (type == CLEAR_REFS_SOFT_DIRTY) {
809 soft_dirty_cleared = true;
810 pr_warn_once("The pagemap bits 55-60 have changed their meaning! "
811 "See the linux/Documentation/vm/pagemap.txt for details.\n");
812 }
813
746 task = get_proc_task(file_inode(file)); 814 task = get_proc_task(file_inode(file));
747 if (!task) 815 if (!task)
748 return -ESRCH; 816 return -ESRCH;
749 mm = get_task_mm(task); 817 mm = get_task_mm(task);
750 if (mm) { 818 if (mm) {
819 struct clear_refs_private cp = {
820 .type = type,
821 };
751 struct mm_walk clear_refs_walk = { 822 struct mm_walk clear_refs_walk = {
752 .pmd_entry = clear_refs_pte_range, 823 .pmd_entry = clear_refs_pte_range,
753 .mm = mm, 824 .mm = mm,
825 .private = &cp,
754 }; 826 };
755 down_read(&mm->mmap_sem); 827 down_read(&mm->mmap_sem);
828 if (type == CLEAR_REFS_SOFT_DIRTY)
829 mmu_notifier_invalidate_range_start(mm, 0, -1);
756 for (vma = mm->mmap; vma; vma = vma->vm_next) { 830 for (vma = mm->mmap; vma; vma = vma->vm_next) {
757 clear_refs_walk.private = vma; 831 cp.vma = vma;
758 if (is_vm_hugetlb_page(vma)) 832 if (is_vm_hugetlb_page(vma))
759 continue; 833 continue;
760 /* 834 /*
@@ -773,6 +847,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
773 walk_page_range(vma->vm_start, vma->vm_end, 847 walk_page_range(vma->vm_start, vma->vm_end,
774 &clear_refs_walk); 848 &clear_refs_walk);
775 } 849 }
850 if (type == CLEAR_REFS_SOFT_DIRTY)
851 mmu_notifier_invalidate_range_end(mm, 0, -1);
776 flush_tlb_mm(mm); 852 flush_tlb_mm(mm);
777 up_read(&mm->mmap_sem); 853 up_read(&mm->mmap_sem);
778 mmput(mm); 854 mmput(mm);
@@ -792,14 +868,15 @@ typedef struct {
792} pagemap_entry_t; 868} pagemap_entry_t;
793 869
794struct pagemapread { 870struct pagemapread {
795 int pos, len; 871 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
796 pagemap_entry_t *buffer; 872 pagemap_entry_t *buffer;
873 bool v2;
797}; 874};
798 875
799#define PAGEMAP_WALK_SIZE (PMD_SIZE) 876#define PAGEMAP_WALK_SIZE (PMD_SIZE)
800#define PAGEMAP_WALK_MASK (PMD_MASK) 877#define PAGEMAP_WALK_MASK (PMD_MASK)
801 878
802#define PM_ENTRY_BYTES sizeof(u64) 879#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
803#define PM_STATUS_BITS 3 880#define PM_STATUS_BITS 3
804#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 881#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
805#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 882#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -807,14 +884,17 @@ struct pagemapread {
807#define PM_PSHIFT_BITS 6 884#define PM_PSHIFT_BITS 6
808#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 885#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
809#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 886#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
810#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 887#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
811#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 888#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
812#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 889#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
890/* in "new" pagemap pshift bits are occupied with more status bits */
891#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
813 892
893#define __PM_SOFT_DIRTY (1LL)
814#define PM_PRESENT PM_STATUS(4LL) 894#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 895#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL) 896#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 897#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
818#define PM_END_OF_BUFFER 1 898#define PM_END_OF_BUFFER 1
819 899
820static inline pagemap_entry_t make_pme(u64 val) 900static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +917,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 917 struct pagemapread *pm = walk->private;
838 unsigned long addr; 918 unsigned long addr;
839 int err = 0; 919 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 920 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
841 921
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 922 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 923 err = add_to_pagemap(addr, &pme, pm);
@@ -847,38 +927,43 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
847 return err; 927 return err;
848} 928}
849 929
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 930static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 931 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 932{
853 u64 frame, flags; 933 u64 frame, flags;
854 struct page *page = NULL; 934 struct page *page = NULL;
935 int flags2 = 0;
855 936
856 if (pte_present(pte)) { 937 if (pte_present(pte)) {
857 frame = pte_pfn(pte); 938 frame = pte_pfn(pte);
858 flags = PM_PRESENT; 939 flags = PM_PRESENT;
859 page = vm_normal_page(vma, addr, pte); 940 page = vm_normal_page(vma, addr, pte);
860 } else if (is_swap_pte(pte)) { 941 } else if (is_swap_pte(pte)) {
861 swp_entry_t entry = pte_to_swp_entry(pte); 942 swp_entry_t entry;
862 943 if (pte_swp_soft_dirty(pte))
944 flags2 |= __PM_SOFT_DIRTY;
945 entry = pte_to_swp_entry(pte);
863 frame = swp_type(entry) | 946 frame = swp_type(entry) |
864 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 947 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
865 flags = PM_SWAP; 948 flags = PM_SWAP;
866 if (is_migration_entry(entry)) 949 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry); 950 page = migration_entry_to_page(entry);
868 } else { 951 } else {
869 *pme = make_pme(PM_NOT_PRESENT); 952 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
870 return; 953 return;
871 } 954 }
872 955
873 if (page && !PageAnon(page)) 956 if (page && !PageAnon(page))
874 flags |= PM_FILE; 957 flags |= PM_FILE;
958 if (pte_soft_dirty(pte))
959 flags2 |= __PM_SOFT_DIRTY;
875 960
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); 961 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
877} 962}
878 963
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 964#ifdef CONFIG_TRANSPARENT_HUGEPAGE
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 965static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
881 pmd_t pmd, int offset) 966 pmd_t pmd, int offset, int pmd_flags2)
882{ 967{
883 /* 968 /*
884 * Currently pmd for thp is always present because thp can not be 969 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +972,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
887 */ 972 */
888 if (pmd_present(pmd)) 973 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 974 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 975 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
891 else 976 else
892 *pme = make_pme(PM_NOT_PRESENT); 977 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
893} 978}
894#else 979#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 980static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
896 pmd_t pmd, int offset) 981 pmd_t pmd, int offset, int pmd_flags2)
897{ 982{
898} 983}
899#endif 984#endif
@@ -905,17 +990,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 990 struct pagemapread *pm = walk->private;
906 pte_t *pte; 991 pte_t *pte;
907 int err = 0; 992 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 993 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
909 994
910 /* find the first VMA at or above 'addr' */ 995 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 996 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 997 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
998 int pmd_flags2;
999
1000 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
913 for (; addr != end; addr += PAGE_SIZE) { 1001 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset; 1002 unsigned long offset;
915 1003
916 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1004 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT; 1005 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset); 1006 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
919 err = add_to_pagemap(addr, &pme, pm); 1007 err = add_to_pagemap(addr, &pme, pm);
920 if (err) 1008 if (err)
921 break; 1009 break;
@@ -932,7 +1020,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
932 * and need a new, higher one */ 1020 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 1021 if (vma && (addr >= vma->vm_end)) {
934 vma = find_vma(walk->mm, addr); 1022 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT); 1023 pme = make_pme(PM_NOT_PRESENT(pm->v2));
936 } 1024 }
937 1025
938 /* check that 'vma' actually covers this address, 1026 /* check that 'vma' actually covers this address,
@@ -940,7 +1028,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
940 if (vma && (vma->vm_start <= addr) && 1028 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 1029 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 1030 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 1031 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
944 /* unmap before userspace copy */ 1032 /* unmap before userspace copy */
945 pte_unmap(pte); 1033 pte_unmap(pte);
946 } 1034 }
@@ -955,14 +1043,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 1043}
956 1044
957#ifdef CONFIG_HUGETLB_PAGE 1045#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 1046static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
959 pte_t pte, int offset) 1047 pte_t pte, int offset)
960{ 1048{
961 if (pte_present(pte)) 1049 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1050 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 1051 | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
964 else 1052 else
965 *pme = make_pme(PM_NOT_PRESENT); 1053 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
966} 1054}
967 1055
968/* This function walks within one hugetlb entry in the single call */ 1056/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1064,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
976 1064
977 for (; addr != end; addr += PAGE_SIZE) { 1065 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1066 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 1067 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 1068 err = add_to_pagemap(addr, &pme, pm);
981 if (err) 1069 if (err)
982 return err; 1070 return err;
@@ -1038,8 +1126,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1038 if (!count) 1126 if (!count)
1039 goto out_task; 1127 goto out_task;
1040 1128
1041 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1129 pm.v2 = soft_dirty_cleared;
1042 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 1130 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1131 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1043 ret = -ENOMEM; 1132 ret = -ENOMEM;
1044 if (!pm.buffer) 1133 if (!pm.buffer)
1045 goto out_task; 1134 goto out_task;
@@ -1110,9 +1199,18 @@ out:
1110 return ret; 1199 return ret;
1111} 1200}
1112 1201
1202static int pagemap_open(struct inode *inode, struct file *file)
1203{
1204 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1205 "to stop being page-shift some time soon. See the "
1206 "linux/Documentation/vm/pagemap.txt for details.\n");
1207 return 0;
1208}
1209
1113const struct file_operations proc_pagemap_operations = { 1210const struct file_operations proc_pagemap_operations = {
1114 .llseek = mem_lseek, /* borrow this */ 1211 .llseek = mem_lseek, /* borrow this */
1115 .read = pagemap_read, 1212 .read = pagemap_read,
1213 .open = pagemap_open,
1116}; 1214};
1117#endif /* CONFIG_PROC_PAGE_MONITOR */ 1215#endif /* CONFIG_PROC_PAGE_MONITOR */
1118 1216
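
Putting the macros above together: in the "v2" layout (active once soft-dirty has been cleared at least once), a pagemap entry carries bit 63 present, bit 62 swap, bit 61 file-backed or shared-anon, bit 55 soft-dirty, and the PFN (or swap type plus offset) in bits 0-54. A small user-space decoder, as a sketch:

    #include <stdint.h>
    #include <stdio.h>

    static void decode_pagemap_entry(uint64_t e)
    {
            printf("present=%d swap=%d file=%d soft-dirty=%d pfn=0x%llx\n",
                   (int)((e >> 63) & 1), (int)((e >> 62) & 1),
                   (int)((e >> 61) & 1), (int)((e >> 55) & 1),
                   (unsigned long long)(e & ((1ULL << 55) - 1)));
    }

    int main(void)
    {
            /* a present, soft-dirty page at PFN 0x1234 */
            decode_pagemap_entry((1ULL << 63) | (1ULL << 55) | 0x1234);
            return 0;
    }
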
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 get_monotonic_boottime(&uptime);
24 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 24 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); 25 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem; 26 idle.tv_nsec = rem;
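
get_monotonic_boottime() above folds the old two-step dance into one call: monotonic time that also keeps advancing while the machine is suspended. The user-space counterpart, handy for sanity-checking /proc/uptime, is clock_gettime() with the Linux-specific CLOCK_BOOTTIME (available since 2.6.39):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (clock_gettime(CLOCK_BOOTTIME, &ts))
                    perror("clock_gettime");        /* pre-2.6.39 kernel */
            else
                    printf("up %lld.%09ld s\n",
                           (long long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }
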
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include "internal.h" 26#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
32/* Stores the pointer to the buffer containing kernel elf core headers. */ 33/* Stores the pointer to the buffer containing kernel elf core headers. */
33static char *elfcorebuf; 34static char *elfcorebuf;
34static size_t elfcorebuf_sz; 35static size_t elfcorebuf_sz;
36static size_t elfcorebuf_sz_orig;
37
38static char *elfnotes_buf;
39static size_t elfnotes_sz;
35 40
36/* Total size of vmcore file. */ 41/* Total size of vmcore file. */
37static u64 vmcore_size; 42static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
118 return read; 123 return read;
119} 124}
120 125
121/* Maps a vmcore file offset to the respective physical address in memory. */
122static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
123 struct vmcore **m_ptr)
124{
125 struct vmcore *m;
126 u64 paddr;
127
128 list_for_each_entry(m, vc_list, list) {
129 u64 start, end;
130 start = m->offset;
131 end = m->offset + m->size - 1;
132 if (offset >= start && offset <= end) {
133 paddr = m->paddr + offset - start;
134 *m_ptr = m;
135 return paddr;
136 }
137 }
138 *m_ptr = NULL;
139 return 0;
140}
141
142/* Read from the ELF header and then the crash dump. On error, a negative value is 126/* Read from the ELF header and then the crash dump. On error, a negative value is
143 * returned; otherwise, the number of bytes read is returned. 127 * returned; otherwise, the number of bytes read is returned.
144 */ 128 */
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
147{ 131{
148 ssize_t acc = 0, tmp; 132 ssize_t acc = 0, tmp;
149 size_t tsz; 133 size_t tsz;
150 u64 start, nr_bytes; 134 u64 start;
151 struct vmcore *curr_m = NULL; 135 struct vmcore *m = NULL;
152 136
153 if (buflen == 0 || *fpos >= vmcore_size) 137 if (buflen == 0 || *fpos >= vmcore_size)
154 return 0; 138 return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
159 143
160 /* Read ELF core header */ 144 /* Read ELF core header */
161 if (*fpos < elfcorebuf_sz) { 145 if (*fpos < elfcorebuf_sz) {
162 tsz = elfcorebuf_sz - *fpos; 146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
163 if (buflen < tsz)
164 tsz = buflen;
165 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
166 return -EFAULT; 148 return -EFAULT;
167 buflen -= tsz; 149 buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
174 return acc; 156 return acc;
175 } 157 }
176 158
177 start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); 159 /* Read ELF note segment */
178 if (!curr_m) 160 if (*fpos < elfcorebuf_sz + elfnotes_sz) {
179 return -EINVAL; 161 void *kaddr;
180
181 while (buflen) {
182 tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
183 162
184 /* Calculate left bytes in current memory segment. */ 163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
185 nr_bytes = (curr_m->size - (start - curr_m->paddr)); 164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
186 if (tsz > nr_bytes) 165 if (copy_to_user(buffer, kaddr, tsz))
187 tsz = nr_bytes; 166 return -EFAULT;
188
189 tmp = read_from_oldmem(buffer, tsz, &start, 1);
190 if (tmp < 0)
191 return tmp;
192 buflen -= tsz; 167 buflen -= tsz;
193 *fpos += tsz; 168 *fpos += tsz;
194 buffer += tsz; 169 buffer += tsz;
195 acc += tsz; 170 acc += tsz;
196 if (start >= (curr_m->paddr + curr_m->size)) { 171
197 if (curr_m->list.next == &vmcore_list) 172 /* leave now if the buffer is already full */
198 return acc; /*EOF*/ 173 if (buflen == 0)
199 curr_m = list_entry(curr_m->list.next, 174 return acc;
200 struct vmcore, list); 175 }
201 start = curr_m->paddr; 176
177 list_for_each_entry(m, &vmcore_list, list) {
178 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1);
182 if (tmp < 0)
183 return tmp;
184 buflen -= tsz;
185 *fpos += tsz;
186 buffer += tsz;
187 acc += tsz;
188
189 /* leave now if the buffer is already full */
190 if (buflen == 0)
191 return acc;
202 } 192 }
203 } 193 }
194
204 return acc; 195 return acc;
205} 196}
206 197
198/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory
201 *
202 * @notes_sz: size of buffer
203 *
204 * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
205 * the buffer to user-space by means of remap_vmalloc_range().
206 *
207 * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
208 * disabled and there's no need to allow users to mmap the buffer.
209 */
210static inline char *alloc_elfnotes_buf(size_t notes_sz)
211{
212#ifdef CONFIG_MMU
213 return vmalloc_user(notes_sz);
214#else
215 return vzalloc(notes_sz);
216#endif
217}
218
219/*
220 * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
221 * essential for mmap_vmcore() in order to map physically
222 * non-contiguous objects (ELF header, ELF note segment and memory
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout.
225 */
226#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{
229 size_t size = vma->vm_end - vma->vm_start;
230 u64 start, end, len, tsz;
231 struct vmcore *m;
232
233 start = (u64)vma->vm_pgoff << PAGE_SHIFT;
234 end = start + size;
235
236 if (size > vmcore_size || end > vmcore_size)
237 return -EINVAL;
238
239 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
240 return -EPERM;
241
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP;
244
245 len = 0;
246
247 if (start < elfcorebuf_sz) {
248 u64 pfn;
249
250 tsz = min(elfcorebuf_sz - (size_t)start, size);
251 pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
252 if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
253 vma->vm_page_prot))
254 return -EAGAIN;
255 size -= tsz;
256 start += tsz;
257 len += tsz;
258
259 if (size == 0)
260 return 0;
261 }
262
263 if (start < elfcorebuf_sz + elfnotes_sz) {
264 void *kaddr;
265
266 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
267 kaddr = elfnotes_buf + start - elfcorebuf_sz;
268 if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
269 kaddr, tsz))
270 goto fail;
271 size -= tsz;
272 start += tsz;
273 len += tsz;
274
275 if (size == 0)
276 return 0;
277 }
278
279 list_for_each_entry(m, &vmcore_list, list) {
280 if (start < m->offset + m->size) {
281 u64 paddr = 0;
282
283 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot))
288 goto fail;
289 size -= tsz;
290 start += tsz;
291 len += tsz;
292
293 if (size == 0)
294 return 0;
295 }
296 }
297
298 return 0;
299fail:
300 do_munmap(vma->vm_mm, vma->vm_start, len);
301 return -EAGAIN;
302}
303#else
304static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
305{
306 return -ENOSYS;
307}
308#endif
309
207static const struct file_operations proc_vmcore_operations = { 310static const struct file_operations proc_vmcore_operations = {
208 .read = read_vmcore, 311 .read = read_vmcore,
209 .llseek = default_llseek, 312 .llseek = default_llseek,
313 .mmap = mmap_vmcore,
210}; 314};
211 315
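
With .mmap wired up above, a dump tool such as makedumpfile can map /proc/vmcore instead of read()ing it page by page; mmap_vmcore() stitches the ELF header, the merged note buffer and the old kernel's PT_LOAD regions into one read-only, virtually contiguous view. A minimal user-space sketch, meaningful only inside a capture (kdump) kernel, that maps the first page and checks the ELF magic:

    #include <elf.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/vmcore", O_RDONLY);
            void *p;

            if (fd < 0) {
                    perror("open");         /* not in a capture kernel? */
                    return 1;
            }
            p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");         /* kernel without mmap_vmcore */
                    close(fd);
                    return 1;
            }
            puts(memcmp(p, ELFMAG, SELFMAG) == 0 ? "ELF dump" : "not ELF");
            munmap(p, 4096);
            close(fd);
            return 0;
    }
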
212static struct vmcore* __init get_new_element(void) 316static struct vmcore* __init get_new_element(void)
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
214 return kzalloc(sizeof(struct vmcore), GFP_KERNEL); 318 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
215} 319}
216 320
217static u64 __init get_vmcore_size_elf64(char *elfptr) 321static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
322 struct list_head *vc_list)
218{ 323{
219 int i;
220 u64 size;
221 Elf64_Ehdr *ehdr_ptr;
222 Elf64_Phdr *phdr_ptr;
223
224 ehdr_ptr = (Elf64_Ehdr *)elfptr;
225 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
226 size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
227 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
228 size += phdr_ptr->p_memsz;
229 phdr_ptr++;
230 }
231 return size;
232}
233
234static u64 __init get_vmcore_size_elf32(char *elfptr)
235{
236 int i;
237 u64 size; 324 u64 size;
238 Elf32_Ehdr *ehdr_ptr; 325 struct vmcore *m;
239 Elf32_Phdr *phdr_ptr;
240 326
241 ehdr_ptr = (Elf32_Ehdr *)elfptr; 327 size = elfsz + elfnotesegsz;
242 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); 328 list_for_each_entry(m, vc_list, list) {
243 size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); 329 size += m->size;
244 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
245 size += phdr_ptr->p_memsz;
246 phdr_ptr++;
247 } 330 }
248 return size; 331 return size;
249} 332}
250 333
251/* Merges all the PT_NOTE headers into one. */ 334/**
252static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, 335 * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
253 struct list_head *vc_list) 336 *
337 * @ehdr_ptr: ELF header
338 *
339 * This function updates the p_memsz member of each PT_NOTE entry in
340 * the program header table pointed to by @ehdr_ptr to the real size
341 * of the ELF note segment.
342 */
343static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
254{ 344{
255 int i, nr_ptnote=0, rc=0; 345 int i, rc=0;
256 char *tmp; 346 Elf64_Phdr *phdr_ptr;
257 Elf64_Ehdr *ehdr_ptr;
258 Elf64_Phdr phdr, *phdr_ptr;
259 Elf64_Nhdr *nhdr_ptr; 347 Elf64_Nhdr *nhdr_ptr;
260 u64 phdr_sz = 0, note_off;
261 348
262 ehdr_ptr = (Elf64_Ehdr *)elfptr; 349 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
263 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
264 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 350 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
265 int j;
266 void *notes_section; 351 void *notes_section;
267 struct vmcore *new;
268 u64 offset, max_sz, sz, real_sz = 0; 352 u64 offset, max_sz, sz, real_sz = 0;
269 if (phdr_ptr->p_type != PT_NOTE) 353 if (phdr_ptr->p_type != PT_NOTE)
270 continue; 354 continue;
271 nr_ptnote++;
272 max_sz = phdr_ptr->p_memsz; 355 max_sz = phdr_ptr->p_memsz;
273 offset = phdr_ptr->p_offset; 356 offset = phdr_ptr->p_offset;
274 notes_section = kmalloc(max_sz, GFP_KERNEL); 357 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
280 return rc; 363 return rc;
281 } 364 }
282 nhdr_ptr = notes_section; 365 nhdr_ptr = notes_section;
283 for (j = 0; j < max_sz; j += sz) { 366 while (real_sz < max_sz) {
284 if (nhdr_ptr->n_namesz == 0) 367 if (nhdr_ptr->n_namesz == 0)
285 break; 368 break;
286 sz = sizeof(Elf64_Nhdr) + 369 sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
289 real_sz += sz; 372 real_sz += sz;
290 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); 373 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
291 } 374 }
292
293 /* Add this contiguous chunk of notes section to vmcore list.*/
294 new = get_new_element();
295 if (!new) {
296 kfree(notes_section);
297 return -ENOMEM;
298 }
299 new->paddr = phdr_ptr->p_offset;
300 new->size = real_sz;
301 list_add_tail(&new->list, vc_list);
302 phdr_sz += real_sz;
303 kfree(notes_section); 375 kfree(notes_section);
376 phdr_ptr->p_memsz = real_sz;
377 }
378
379 return 0;
380}
381
382/**
383 * get_note_number_and_size_elf64 - get the number of PT_NOTE program
384 * headers and the total real size of their ELF note segment headers
385 * and data.
386 *
387 * @ehdr_ptr: ELF header
388 * @nr_ptnote: buffer for the number of PT_NOTE program headers
389 * @sz_ptnote: buffer for the size of the unique PT_NOTE program header
390 *
391 * This function is used to merge multiple PT_NOTE program headers
392 * into a single unique one. The resulting unique entry will have
393 * @sz_ptnote in its phdr->p_memsz.
394 *
395 * It is assumed that the program headers with PT_NOTE type pointed to
396 * by @ehdr_ptr have already been updated by update_note_header_size_elf64
397 * and that each PT_NOTE program header has the actual ELF note segment
398 * size in its p_memsz member.
399 */
400static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
401 int *nr_ptnote, u64 *sz_ptnote)
402{
403 int i;
404 Elf64_Phdr *phdr_ptr;
405
406 *nr_ptnote = *sz_ptnote = 0;
407
408 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
409 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
410 if (phdr_ptr->p_type != PT_NOTE)
411 continue;
412 *nr_ptnote += 1;
413 *sz_ptnote += phdr_ptr->p_memsz;
414 }
415
416 return 0;
417}
418
419/**
420 * copy_notes_elf64 - copy ELF note segments into a given buffer
421 *
422 * @ehdr_ptr: ELF header
423 * @notes_buf: buffer into which ELF note segments are copied
424 *
425 * This function is used to copy the ELF note segment from the 1st
426 * kernel into the buffer @notes_buf in the 2nd kernel. It is assumed
427 * that the size of the buffer @notes_buf is equal to or larger than
428 * the sum of the real ELF note segment headers and data.
429 *
430 * It is assumed that the program headers with PT_NOTE type pointed to
431 * by @ehdr_ptr have already been updated by update_note_header_size_elf64
432 * and that each PT_NOTE program header has the actual ELF note segment
433 * size in its p_memsz member.
434 */
435static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
436{
437 int i, rc=0;
438 Elf64_Phdr *phdr_ptr;
439
440 phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
441
442 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
443 u64 offset;
444 if (phdr_ptr->p_type != PT_NOTE)
445 continue;
446 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
448 if (rc < 0)
449 return rc;
450 notes_buf += phdr_ptr->p_memsz;
304 } 451 }
305 452
453 return 0;
454}
455
456/* Merges all the PT_NOTE headers into one. */
457static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
458 char **notes_buf, size_t *notes_sz)
459{
460 int i, nr_ptnote=0, rc=0;
461 char *tmp;
462 Elf64_Ehdr *ehdr_ptr;
463 Elf64_Phdr phdr;
464 u64 phdr_sz = 0, note_off;
465
466 ehdr_ptr = (Elf64_Ehdr *)elfptr;
467
468 rc = update_note_header_size_elf64(ehdr_ptr);
469 if (rc < 0)
470 return rc;
471
472 rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
473 if (rc < 0)
474 return rc;
475
476 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
477 *notes_buf = alloc_elfnotes_buf(*notes_sz);
478 if (!*notes_buf)
479 return -ENOMEM;
480
481 rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
482 if (rc < 0)
483 return rc;
484
306 /* Prepare merged PT_NOTE program header. */ 485 /* Prepare merged PT_NOTE program header. */
307 phdr.p_type = PT_NOTE; 486 phdr.p_type = PT_NOTE;
308 phdr.p_flags = 0; 487 phdr.p_flags = 0;
309 note_off = sizeof(Elf64_Ehdr) + 488 note_off = sizeof(Elf64_Ehdr) +
310 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); 489 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
311 phdr.p_offset = note_off; 490 phdr.p_offset = roundup(note_off, PAGE_SIZE);
312 phdr.p_vaddr = phdr.p_paddr = 0; 491 phdr.p_vaddr = phdr.p_paddr = 0;
313 phdr.p_filesz = phdr.p_memsz = phdr_sz; 492 phdr.p_filesz = phdr.p_memsz = phdr_sz;
314 phdr.p_align = 0; 493 phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
322 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); 501 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
323 *elfsz = *elfsz - i; 502 *elfsz = *elfsz - i;
324 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); 503 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
504 memset(elfptr + *elfsz, 0, i);
505 *elfsz = roundup(*elfsz, PAGE_SIZE);
325 506
326 /* Modify e_phnum to reflect merged headers. */ 507 /* Modify e_phnum to reflect merged headers. */
327 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 508 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
329 return 0; 510 return 0;
330} 511}
331 512
332/* Merges all the PT_NOTE headers into one. */ 513/**
333static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, 514 * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
334 struct list_head *vc_list) 515 *
516 * @ehdr_ptr: ELF header
517 *
518 * This function updates the p_memsz member of each PT_NOTE entry in
519 * the program header table pointed to by @ehdr_ptr to the real size
520 * of the ELF note segment.
521 */
522static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
335{ 523{
336 int i, nr_ptnote=0, rc=0; 524 int i, rc=0;
337 char *tmp; 525 Elf32_Phdr *phdr_ptr;
338 Elf32_Ehdr *ehdr_ptr;
339 Elf32_Phdr phdr, *phdr_ptr;
340 Elf32_Nhdr *nhdr_ptr; 526 Elf32_Nhdr *nhdr_ptr;
341 u64 phdr_sz = 0, note_off;
342 527
343 ehdr_ptr = (Elf32_Ehdr *)elfptr; 528 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
344 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
345 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 529 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
346 int j;
347 void *notes_section; 530 void *notes_section;
348 struct vmcore *new;
349 u64 offset, max_sz, sz, real_sz = 0; 531 u64 offset, max_sz, sz, real_sz = 0;
350 if (phdr_ptr->p_type != PT_NOTE) 532 if (phdr_ptr->p_type != PT_NOTE)
351 continue; 533 continue;
352 nr_ptnote++;
353 max_sz = phdr_ptr->p_memsz; 534 max_sz = phdr_ptr->p_memsz;
354 offset = phdr_ptr->p_offset; 535 offset = phdr_ptr->p_offset;
355 notes_section = kmalloc(max_sz, GFP_KERNEL); 536 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
361 return rc; 542 return rc;
362 } 543 }
363 nhdr_ptr = notes_section; 544 nhdr_ptr = notes_section;
364 for (j = 0; j < max_sz; j += sz) { 545 while (real_sz < max_sz) {
365 if (nhdr_ptr->n_namesz == 0) 546 if (nhdr_ptr->n_namesz == 0)
366 break; 547 break;
367 sz = sizeof(Elf32_Nhdr) + 548 sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
370 real_sz += sz; 551 real_sz += sz;
371 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); 552 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
372 } 553 }
373
374 /* Add this contiguous chunk of notes section to vmcore list.*/
375 new = get_new_element();
376 if (!new) {
377 kfree(notes_section);
378 return -ENOMEM;
379 }
380 new->paddr = phdr_ptr->p_offset;
381 new->size = real_sz;
382 list_add_tail(&new->list, vc_list);
383 phdr_sz += real_sz;
384 kfree(notes_section); 554 kfree(notes_section);
555 phdr_ptr->p_memsz = real_sz;
556 }
557
558 return 0;
559}
560
561/**
562 * get_note_number_and_size_elf32 - get the number of PT_NOTE program
563 * headers and the total real size of their ELF note segment headers
564 * and data.
565 *
566 * @ehdr_ptr: ELF header
567 * @nr_ptnote: buffer for the number of PT_NOTE program headers
568 * @sz_ptnote: buffer for the size of the unique PT_NOTE program header
569 *
570 * This function is used to merge multiple PT_NOTE program headers
571 * into a single unique one. The resulting unique entry will have
572 * @sz_ptnote in its phdr->p_memsz.
573 *
574 * It is assumed that the program headers with PT_NOTE type pointed to
575 * by @ehdr_ptr have already been updated by update_note_header_size_elf32
576 * and that each PT_NOTE program header has the actual ELF note segment
577 * size in its p_memsz member.
578 */
579static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
580 int *nr_ptnote, u64 *sz_ptnote)
581{
582 int i;
583 Elf32_Phdr *phdr_ptr;
584
585 *nr_ptnote = *sz_ptnote = 0;
586
587 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
588 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
589 if (phdr_ptr->p_type != PT_NOTE)
590 continue;
591 *nr_ptnote += 1;
592 *sz_ptnote += phdr_ptr->p_memsz;
593 }
594
595 return 0;
596}
597
598/**
599 * copy_notes_elf32 - copy ELF note segments into a given buffer
600 *
601 * @ehdr_ptr: ELF header
602 * @notes_buf: buffer into which ELF note segments are copied
603 *
604 * This function is used to copy the ELF note segment from the 1st
605 * kernel into the buffer @notes_buf in the 2nd kernel. It is assumed
606 * that the size of the buffer @notes_buf is equal to or larger than
607 * the sum of the real ELF note segment headers and data.
608 *
609 * It is assumed that the program headers with PT_NOTE type pointed to
610 * by @ehdr_ptr have already been updated by update_note_header_size_elf32
611 * and that each PT_NOTE program header has the actual ELF note segment
612 * size in its p_memsz member.
613 */
614static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
615{
616 int i, rc=0;
617 Elf32_Phdr *phdr_ptr;
618
619 phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
620
621 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
622 u64 offset;
623 if (phdr_ptr->p_type != PT_NOTE)
624 continue;
625 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
627 if (rc < 0)
628 return rc;
629 notes_buf += phdr_ptr->p_memsz;
385 } 630 }
386 631
632 return 0;
633}
634
635/* Merges all the PT_NOTE headers into one. */
636static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
637 char **notes_buf, size_t *notes_sz)
638{
639 int i, nr_ptnote=0, rc=0;
640 char *tmp;
641 Elf32_Ehdr *ehdr_ptr;
642 Elf32_Phdr phdr;
643 u64 phdr_sz = 0, note_off;
644
645 ehdr_ptr = (Elf32_Ehdr *)elfptr;
646
647 rc = update_note_header_size_elf32(ehdr_ptr);
648 if (rc < 0)
649 return rc;
650
651 rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
652 if (rc < 0)
653 return rc;
654
655 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
656 *notes_buf = alloc_elfnotes_buf(*notes_sz);
657 if (!*notes_buf)
658 return -ENOMEM;
659
660 rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
661 if (rc < 0)
662 return rc;
663
387 /* Prepare merged PT_NOTE program header. */ 664 /* Prepare merged PT_NOTE program header. */
388 phdr.p_type = PT_NOTE; 665 phdr.p_type = PT_NOTE;
389 phdr.p_flags = 0; 666 phdr.p_flags = 0;
390 note_off = sizeof(Elf32_Ehdr) + 667 note_off = sizeof(Elf32_Ehdr) +
391 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); 668 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
392 phdr.p_offset = note_off; 669 phdr.p_offset = roundup(note_off, PAGE_SIZE);
393 phdr.p_vaddr = phdr.p_paddr = 0; 670 phdr.p_vaddr = phdr.p_paddr = 0;
394 phdr.p_filesz = phdr.p_memsz = phdr_sz; 671 phdr.p_filesz = phdr.p_memsz = phdr_sz;
395 phdr.p_align = 0; 672 phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
403 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); 680 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
404 *elfsz = *elfsz - i; 681 *elfsz = *elfsz - i;
405 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); 682 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
683 memset(elfptr + *elfsz, 0, i);
684 *elfsz = roundup(*elfsz, PAGE_SIZE);
406 685
407 /* Modify e_phnum to reflect merged headers. */ 686 /* Modify e_phnum to reflect merged headers. */
408 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 687 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
414 * the new offset fields of exported program headers. */ 693 * the new offset fields of exported program headers. */
415static int __init process_ptload_program_headers_elf64(char *elfptr, 694static int __init process_ptload_program_headers_elf64(char *elfptr,
416 size_t elfsz, 695 size_t elfsz,
696 size_t elfnotes_sz,
417 struct list_head *vc_list) 697 struct list_head *vc_list)
418{ 698{
419 int i; 699 int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
425 ehdr_ptr = (Elf64_Ehdr *)elfptr; 705 ehdr_ptr = (Elf64_Ehdr *)elfptr;
426 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ 706 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
427 707
428 /* First program header is PT_NOTE header. */ 708 /* Skip Elf header, program headers and Elf note segment. */
429 vmcore_off = sizeof(Elf64_Ehdr) + 709 vmcore_off = elfsz + elfnotes_sz;
430 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
431 phdr_ptr->p_memsz; /* Note sections */
432 710
433 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 711 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
712 u64 paddr, start, end, size;
713
434 if (phdr_ptr->p_type != PT_LOAD) 714 if (phdr_ptr->p_type != PT_LOAD)
435 continue; 715 continue;
436 716
717 paddr = phdr_ptr->p_offset;
718 start = rounddown(paddr, PAGE_SIZE);
719 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
720 size = end - start;
721
437 /* Add this contiguous chunk of memory to vmcore list.*/ 722 /* Add this contiguous chunk of memory to vmcore list.*/
438 new = get_new_element(); 723 new = get_new_element();
439 if (!new) 724 if (!new)
440 return -ENOMEM; 725 return -ENOMEM;
441 new->paddr = phdr_ptr->p_offset; 726 new->paddr = start;
442 new->size = phdr_ptr->p_memsz; 727 new->size = size;
443 list_add_tail(&new->list, vc_list); 728 list_add_tail(&new->list, vc_list);
444 729
445 /* Update the program header offset. */ 730 /* Update the program header offset. */
446 phdr_ptr->p_offset = vmcore_off; 731 phdr_ptr->p_offset = vmcore_off + (paddr - start);
447 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 732 vmcore_off = vmcore_off + size;
448 } 733 }
449 return 0; 734 return 0;
450} 735}
451 736
452static int __init process_ptload_program_headers_elf32(char *elfptr, 737static int __init process_ptload_program_headers_elf32(char *elfptr,
453 size_t elfsz, 738 size_t elfsz,
739 size_t elfnotes_sz,
454 struct list_head *vc_list) 740 struct list_head *vc_list)
455{ 741{
456 int i; 742 int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
462 ehdr_ptr = (Elf32_Ehdr *)elfptr; 748 ehdr_ptr = (Elf32_Ehdr *)elfptr;
463 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ 749 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
464 750
465 /* First program header is PT_NOTE header. */ 751 /* Skip Elf header, program headers and Elf note segment. */
466 vmcore_off = sizeof(Elf32_Ehdr) + 752 vmcore_off = elfsz + elfnotes_sz;
467 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
468 phdr_ptr->p_memsz; /* Note sections */
469 753
470 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 754 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
755 u64 paddr, start, end, size;
756
471 if (phdr_ptr->p_type != PT_LOAD) 757 if (phdr_ptr->p_type != PT_LOAD)
472 continue; 758 continue;
473 759
760 paddr = phdr_ptr->p_offset;
761 start = rounddown(paddr, PAGE_SIZE);
762 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
763 size = end - start;
764
474 /* Add this contiguous chunk of memory to vmcore list.*/ 765 /* Add this contiguous chunk of memory to vmcore list.*/
475 new = get_new_element(); 766 new = get_new_element();
476 if (!new) 767 if (!new)
477 return -ENOMEM; 768 return -ENOMEM;
478 new->paddr = phdr_ptr->p_offset; 769 new->paddr = start;
479 new->size = phdr_ptr->p_memsz; 770 new->size = size;
480 list_add_tail(&new->list, vc_list); 771 list_add_tail(&new->list, vc_list);
481 772
482 /* Update the program header offset */ 773 /* Update the program header offset */
483 phdr_ptr->p_offset = vmcore_off; 774 phdr_ptr->p_offset = vmcore_off + (paddr - start);
484 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 775 vmcore_off = vmcore_off + size;
485 } 776 }
486 return 0; 777 return 0;
487} 778}
488 779
489/* Sets offset fields of vmcore elements. */ 780/* Sets offset fields of vmcore elements. */
490static void __init set_vmcore_list_offsets_elf64(char *elfptr, 781static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
491 struct list_head *vc_list) 782 struct list_head *vc_list)
492{ 783{
493 loff_t vmcore_off; 784 loff_t vmcore_off;
494 Elf64_Ehdr *ehdr_ptr;
495 struct vmcore *m; 785 struct vmcore *m;
496 786
497 ehdr_ptr = (Elf64_Ehdr *)elfptr; 787 /* Skip Elf header, program headers and Elf note segment. */
498 788 vmcore_off = elfsz + elfnotes_sz;
499 /* Skip Elf header and program headers. */
500 vmcore_off = sizeof(Elf64_Ehdr) +
501 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
502 789
503 list_for_each_entry(m, vc_list, list) { 790 list_for_each_entry(m, vc_list, list) {
504 m->offset = vmcore_off; 791 m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
506 } 793 }
507} 794}
508 795
509/* Sets offset fields of vmcore elements. */ 796static void free_elfcorebuf(void)
510static void __init set_vmcore_list_offsets_elf32(char *elfptr,
511 struct list_head *vc_list)
512{ 797{
513 loff_t vmcore_off; 798 free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
514 Elf32_Ehdr *ehdr_ptr; 799 elfcorebuf = NULL;
515 struct vmcore *m; 800 vfree(elfnotes_buf);
516 801 elfnotes_buf = NULL;
517 ehdr_ptr = (Elf32_Ehdr *)elfptr;
518
519 /* Skip Elf header and program headers. */
520 vmcore_off = sizeof(Elf32_Ehdr) +
521 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
522
523 list_for_each_entry(m, vc_list, list) {
524 m->offset = vmcore_off;
525 vmcore_off += m->size;
526 }
527} 802}
528 803
529static int __init parse_crash_elf64_headers(void) 804static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
554 } 829 }
555 830
556 /* Read in all elf headers. */ 831 /* Read in all elf headers. */
557 elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); 832 elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
558 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 833 ehdr.e_phnum * sizeof(Elf64_Phdr);
834 elfcorebuf_sz = elfcorebuf_sz_orig;
835 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
836 get_order(elfcorebuf_sz_orig));
559 if (!elfcorebuf) 837 if (!elfcorebuf)
560 return -ENOMEM; 838 return -ENOMEM;
561 addr = elfcorehdr_addr; 839 addr = elfcorehdr_addr;
562 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
563 if (rc < 0) { 841 if (rc < 0)
564 kfree(elfcorebuf); 842 goto fail;
565 return rc;
566 }
567 843
568 /* Merge all PT_NOTE headers into one. */ 844 /* Merge all PT_NOTE headers into one. */
569 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 845 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
570 if (rc) { 846 &elfnotes_buf, &elfnotes_sz);
571 kfree(elfcorebuf); 847 if (rc)
572 return rc; 848 goto fail;
573 }
574 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, 849 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
575 &vmcore_list); 850 elfnotes_sz, &vmcore_list);
576 if (rc) { 851 if (rc)
577 kfree(elfcorebuf); 852 goto fail;
578 return rc; 853 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
579 }
580 set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
581 return 0; 854 return 0;
855fail:
856 free_elfcorebuf();
857 return rc;
582} 858}
583 859
584static int __init parse_crash_elf32_headers(void) 860static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
609 } 885 }
610 886
611 /* Read in all elf headers. */ 887 /* Read in all elf headers. */
612 elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); 888 elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
613 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 889 elfcorebuf_sz = elfcorebuf_sz_orig;
890 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
891 get_order(elfcorebuf_sz_orig));
614 if (!elfcorebuf) 892 if (!elfcorebuf)
615 return -ENOMEM; 893 return -ENOMEM;
616 addr = elfcorehdr_addr; 894 addr = elfcorehdr_addr;
617 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
618 if (rc < 0) { 896 if (rc < 0)
619 kfree(elfcorebuf); 897 goto fail;
620 return rc;
621 }
622 898
623 /* Merge all PT_NOTE headers into one. */ 899 /* Merge all PT_NOTE headers into one. */
624 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 900 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
625 if (rc) { 901 &elfnotes_buf, &elfnotes_sz);
626 kfree(elfcorebuf); 902 if (rc)
627 return rc; 903 goto fail;
628 }
629 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, 904 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
630 &vmcore_list); 905 elfnotes_sz, &vmcore_list);
631 if (rc) { 906 if (rc)
632 kfree(elfcorebuf); 907 goto fail;
633 return rc; 908 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
634 }
635 set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
636 return 0; 909 return 0;
910fail:
911 free_elfcorebuf();
912 return rc;
637} 913}
638 914
639static int __init parse_crash_elf_headers(void) 915static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
655 rc = parse_crash_elf64_headers(); 931 rc = parse_crash_elf64_headers();
656 if (rc) 932 if (rc)
657 return rc; 933 return rc;
658
659 /* Determine vmcore size. */
660 vmcore_size = get_vmcore_size_elf64(elfcorebuf);
661 } else if (e_ident[EI_CLASS] == ELFCLASS32) { 934 } else if (e_ident[EI_CLASS] == ELFCLASS32) {
662 rc = parse_crash_elf32_headers(); 935 rc = parse_crash_elf32_headers();
663 if (rc) 936 if (rc)
664 return rc; 937 return rc;
665
666 /* Determine vmcore size. */
667 vmcore_size = get_vmcore_size_elf32(elfcorebuf);
668 } else { 938 } else {
669 pr_warn("Warning: Core image elf header is not sane\n"); 939 pr_warn("Warning: Core image elf header is not sane\n");
670 return -EINVAL; 940 return -EINVAL;
671 } 941 }
942
943 /* Determine vmcore size. */
944 vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
945 &vmcore_list);
946
672 return 0; 947 return 0;
673} 948}
674 949
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
711 list_del(&m->list); 986 list_del(&m->list);
712 kfree(m); 987 kfree(m);
713 } 988 }
714 kfree(elfcorebuf); 989 free_elfcorebuf();
715 elfcorebuf = NULL;
716} 990}
717EXPORT_SYMBOL_GPL(vmcore_cleanup); 991EXPORT_SYMBOL_GPL(vmcore_cleanup);
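
The vmcore rework above hinges on learning the real size of each PT_NOTE segment by walking raw note records before anything is copied or mapped. The following is a minimal userspace sketch of that size computation, not the kernel code: the function and buffer names are invented, it assumes glibc's <elf.h>, and it applies the standard 4-byte padding of ELF note name/descriptor fields (the padding arithmetic is elided in the hunk above).

	#include <elf.h>
	#include <stdio.h>
	#include <string.h>

	/* ELF note name/descriptor fields are padded to 4-byte boundaries. */
	static size_t align4(size_t v)
	{
		return (v + 3) & ~(size_t)3;
	}

	/*
	 * Walk Elf64_Nhdr records in buf and return the real size of the
	 * note segment, stopping at an empty (n_namesz == 0) terminator,
	 * like the while (real_sz < max_sz) loop in
	 * update_note_header_size_elf64().
	 */
	static size_t elf_notes_real_size(const char *buf, size_t max_sz)
	{
		size_t real_sz = 0;

		while (real_sz < max_sz) {
			const Elf64_Nhdr *nhdr =
				(const Elf64_Nhdr *)(buf + real_sz);

			if (nhdr->n_namesz == 0)
				break;
			real_sz += sizeof(Elf64_Nhdr) +
				align4(nhdr->n_namesz) + align4(nhdr->n_descsz);
		}
		return real_sz;
	}

	int main(void)
	{
		static char buf[64] __attribute__((aligned(4)));
		Elf64_Nhdr nhdr = { .n_namesz = 5, .n_descsz = 4, .n_type = 1 };

		memcpy(buf, &nhdr, sizeof(nhdr));
		memcpy(buf + sizeof(nhdr), "CORE", 5);	/* 4 desc bytes follow */

		/* 12-byte header + 8 (name padded to 4) + 4 (desc) = 24 */
		printf("real size: %zu\n", elf_notes_real_size(buf, sizeof(buf)));
		return 0;
	}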
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
44 rec.parent_ip = parent_ip; 44 rec.parent_ip = parent_ip;
45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); 45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
47 sizeof(rec), psinfo); 47 0, sizeof(rec), psinfo);
48 48
49 local_irq_restore(flags); 49 local_irq_restore(flags);
50} 50}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 else
182 return -EPERM;
181 183
182 return simple_unlink(dir, dentry); 184 return simple_unlink(dir, dentry);
183} 185}
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
324 case PSTORE_TYPE_MCE: 326 case PSTORE_TYPE_MCE:
325 sprintf(name, "mce-%s-%lld", psname, id); 327 sprintf(name, "mce-%s-%lld", psname, id);
326 break; 328 break;
329 case PSTORE_TYPE_PPC_RTAS:
330 sprintf(name, "rtas-%s-%lld", psname, id);
331 break;
332 case PSTORE_TYPE_PPC_OF:
333 sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
334 break;
335 case PSTORE_TYPE_PPC_COMMON:
336 sprintf(name, "powerpc-common-%s-%lld", psname, id);
337 break;
327 case PSTORE_TYPE_UNKNOWN: 338 case PSTORE_TYPE_UNKNOWN:
328 sprintf(name, "unknown-%s-%lld", psname, id); 339 sprintf(name, "unknown-%s-%lld", psname, id);
329 break; 340 break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
159 break; 159 break;
160 160
161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
162 oopscount, hsize + len, psinfo); 162 oopscount, hsize, hsize + len, psinfo);
163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
164 pstore_new_entry = 1; 164 pstore_new_entry = 1;
165 165
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
196 spin_lock_irqsave(&psinfo->buf_lock, flags); 196 spin_lock_irqsave(&psinfo->buf_lock, flags);
197 } 197 }
198 memcpy(psinfo->buf, s, c); 198 memcpy(psinfo->buf, s, c);
199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); 199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
200 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 200 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
201 s += c; 201 s += c;
202 c = e - s; 202 c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
221static int pstore_write_compat(enum pstore_type_id type, 221static int pstore_write_compat(enum pstore_type_id type,
222 enum kmsg_dump_reason reason, 222 enum kmsg_dump_reason reason,
223 u64 *id, unsigned int part, int count, 223 u64 *id, unsigned int part, int count,
224 size_t size, struct pstore_info *psi) 224 size_t hsize, size_t size,
225 struct pstore_info *psi)
225{ 226{
226 return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); 227 return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
228 size, psi);
227} 229}
228 230
229/* 231/*
@@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
239{ 241{
240 struct module *owner = psi->owner; 242 struct module *owner = psi->owner;
241 243
244 if (backend && strcmp(backend, psi->name))
245 return -EPERM;
246
242 spin_lock(&pstore_lock); 247 spin_lock(&pstore_lock);
243 if (psinfo) { 248 if (psinfo) {
244 spin_unlock(&pstore_lock); 249 spin_unlock(&pstore_lock);
245 return -EBUSY; 250 return -EBUSY;
246 } 251 }
247 252
248 if (backend && strcmp(backend, psi->name)) {
249 spin_unlock(&pstore_lock);
250 return -EINVAL;
251 }
252
253 if (!psi->write) 253 if (!psi->write)
254 psi->write = pstore_write_compat; 254 psi->write = pstore_write_compat;
255 psinfo = psi; 255 psinfo = psi;
@@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
274 add_timer(&pstore_timer); 274 add_timer(&pstore_timer);
275 } 275 }
276 276
277 pr_info("pstore: Registered %s as persistent store backend\n",
278 psi->name);
279
277 return 0; 280 return 0;
278} 281}
279EXPORT_SYMBOL_GPL(pstore_register); 282EXPORT_SYMBOL_GPL(pstore_register);
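
Two behavioural changes ride along in this pstore hunk: a backend that does not match the backend= parameter is now rejected with -EPERM before pstore_lock is ever taken, and a successful registration is logged. A rough userspace analogue of that check-before-lock ordering (all names here are invented for illustration):

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <string.h>

	static pthread_mutex_t store_lock = PTHREAD_MUTEX_INITIALIZER;
	static const char *registered;	/* currently registered backend */
	static const char *preferred;	/* like the pstore "backend=" param */

	/* Register a backend, mirroring the reordered checks above. */
	static int backend_register(const char *name)
	{
		/* Cheap parameter check first, before taking the lock. */
		if (preferred && strcmp(preferred, name))
			return -EPERM;

		pthread_mutex_lock(&store_lock);
		if (registered) {
			pthread_mutex_unlock(&store_lock);
			return -EBUSY;	/* only one backend at a time */
		}
		registered = name;
		pthread_mutex_unlock(&store_lock);

		printf("Registered %s as persistent store backend\n", name);
		return 0;
	}

	int main(void)
	{
		preferred = "ramoops";
		printf("%d\n", backend_register("efi"));	/* -EPERM */
		printf("%d\n", backend_register("ramoops"));	/* 0 */
		printf("%d\n", backend_register("ramoops"));	/* -EBUSY */
		return 0;
	}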
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a8f0d6..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, 195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
196 enum kmsg_dump_reason reason, 196 enum kmsg_dump_reason reason,
197 u64 *id, unsigned int part, 197 u64 *id, unsigned int part,
198 const char *buf, size_t size, 198 const char *buf,
199 size_t hsize, size_t size,
199 struct pstore_info *psi) 200 struct pstore_info *psi)
200{ 201{
201 struct ramoops_context *cxt = psi->data; 202 struct ramoops_context *cxt = psi->data;
@@ -399,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
399 goto fail_out; 400 goto fail_out;
400 } 401 }
401 402
402 if (!is_power_of_2(pdata->mem_size))
403 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
404 if (!is_power_of_2(pdata->record_size)) 403 if (!is_power_of_2(pdata->record_size))
405 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 404 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
406 if (!is_power_of_2(pdata->console_size)) 405 if (!is_power_of_2(pdata->console_size))
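
The probe path stops forcing mem_size to a power of two but still rounds record_size and console_size down. For reference, rounding down to a power of two amounts to clearing low set bits until one bit remains; a tiny standalone sketch (the kernel's own rounddown_pow_of_two() is a log2-based macro, so this is only an illustration):

	#include <stdio.h>

	/* Clear the lowest set bit until a single bit (a power of two) remains. */
	static unsigned long down_pow2(unsigned long v)
	{
		while (v & (v - 1))
			v &= v - 1;
		return v;
	}

	int main(void)
	{
		printf("%lu\n", down_pow2(12345));	/* prints 8192 */
		return 0;
	}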
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 59337326e288..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
46} 46}
47 47
48/* increase and wrap the start pointer, returning the old value */ 48/* increase and wrap the start pointer, returning the old value */
49static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) 49static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
50{ 50{
51 int old; 51 int old;
52 int new; 52 int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
62} 62}
63 63
64/* increase the size counter until it hits the max size */ 64/* increase the size counter until it hits the max size */
65static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) 65static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
66{ 66{
67 size_t old; 67 size_t old;
68 size_t new; 68 size_t new;
@@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); 78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
79} 79}
80 80
81static DEFINE_RAW_SPINLOCK(buffer_lock);
82
83/* increase and wrap the start pointer, returning the old value */
84static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
85{
86 int old;
87 int new;
88 unsigned long flags;
89
90 raw_spin_lock_irqsave(&buffer_lock, flags);
91
92 old = atomic_read(&prz->buffer->start);
93 new = old + a;
94 while (unlikely(new > prz->buffer_size))
95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new);
97
98 raw_spin_unlock_irqrestore(&buffer_lock, flags);
99
100 return old;
101}
102
103/* increase the size counter until it hits the max size */
104static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
105{
106 size_t old;
107 size_t new;
108 unsigned long flags;
109
110 raw_spin_lock_irqsave(&buffer_lock, flags);
111
112 old = atomic_read(&prz->buffer->size);
113 if (old == prz->buffer_size)
114 goto exit;
115
116 new = old + a;
117 if (new > prz->buffer_size)
118 new = prz->buffer_size;
119 atomic_set(&prz->buffer->size, new);
120
121exit:
122 raw_spin_unlock_irqrestore(&buffer_lock, flags);
123}
124
125static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
126static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
127
81static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, 128static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
82 uint8_t *data, size_t len, uint8_t *ecc) 129 uint8_t *data, size_t len, uint8_t *ecc)
83{ 130{
@@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
372 return NULL; 419 return NULL;
373 } 420 }
374 421
422 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked;
424
375 return ioremap(start, size); 425 return ioremap(start, size);
376} 426}
377 427
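
The locked variants exist because the atomic cmpxchg loops cannot be relied on for ioremap()ed memory, so persistent_ram_iomap() flips the accessors over to spinlock-protected versions at init time via the two function pointers. A userspace sketch of that one-time function-pointer switch (the ring structure and all names are made up for the example):

	#include <pthread.h>
	#include <stddef.h>
	#include <stdio.h>

	struct ring {
		size_t start;
		size_t size;		/* capacity */
	};

	static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Plain variant: fine for ordinary cached RAM. */
	static size_t start_add_plain(struct ring *r, size_t a)
	{
		size_t old = r->start;

		r->start = (old + a) % r->size;
		return old;
	}

	/* Locked variant: for memory where atomic read-modify-write is unsafe. */
	static size_t start_add_locked(struct ring *r, size_t a)
	{
		size_t old;

		pthread_mutex_lock(&buf_lock);
		old = r->start;
		r->start = (old + a) % r->size;
		pthread_mutex_unlock(&buf_lock);
		return old;
	}

	/* Default to the cheap variant; switched once at init, as in ram_core.c. */
	static size_t (*start_add)(struct ring *, size_t) = start_add_plain;

	int main(void)
	{
		struct ring r = { .start = 0, .size = 16 };
		int region_is_iomem = 1;	/* pretend it was ioremap()ed */
		size_t old;

		if (region_is_iomem)
			start_add = start_add_locked;

		old = start_add(&r, 20);
		printf("old %zu, new %zu\n", old, r.start);	/* old 0, new 4 */
		return 0;
	}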
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014b3cef..b218f965817b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include "qnx4.h" 15#include "qnx4.h"
16 16
17static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 17static int qnx4_readdir(struct file *file, struct dir_context *ctx)
18{ 18{
19 struct inode *inode = file_inode(filp); 19 struct inode *inode = file_inode(file);
20 unsigned int offset; 20 unsigned int offset;
21 struct buffer_head *bh; 21 struct buffer_head *bh;
22 struct qnx4_inode_entry *de; 22 struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int size; 26 int size;
27 27
28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos));
30 30
31 while (filp->f_pos < inode->i_size) { 31 while (ctx->pos < inode->i_size) {
32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
33 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
34 if(bh==NULL) { 34 if (bh == NULL) {
35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); 35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
36 break; 36 return 0;
37 } 37 }
38 ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; 38 ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
39 while (ix < QNX4_INODES_PER_BLOCK) { 39 for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
40 offset = ix * QNX4_DIR_ENTRY_SIZE; 40 offset = ix * QNX4_DIR_ENTRY_SIZE;
41 de = (struct qnx4_inode_entry *) (bh->b_data + offset); 41 de = (struct qnx4_inode_entry *) (bh->b_data + offset);
42 size = strlen(de->di_fname); 42 if (!de->di_fname[0])
43 if (size) { 43 continue;
44 if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) 44 if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
45 size = QNX4_SHORT_NAME_MAX; 45 continue;
46 else if ( size > QNX4_NAME_MAX ) 46 if (!(de->di_status & QNX4_FILE_LINK))
47 size = QNX4_NAME_MAX; 47 size = QNX4_SHORT_NAME_MAX;
48 48 else
49 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 49 size = QNX4_NAME_MAX;
50 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); 50 size = strnlen(de->di_fname, size);
51 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 51 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
52 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 52 if (!(de->di_status & QNX4_FILE_LINK))
53 else { 53 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
54 le = (struct qnx4_link_info*)de; 54 else {
55 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * 55 le = (struct qnx4_link_info*)de;
56 QNX4_INODES_PER_BLOCK + 56 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
57 le->dl_inode_ndx; 57 QNX4_INODES_PER_BLOCK +
58 } 58 le->dl_inode_ndx;
59 if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { 59 }
60 brelse(bh); 60 if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
61 goto out; 61 brelse(bh);
62 } 62 return 0;
63 }
64 } 63 }
65 ix++;
66 filp->f_pos += QNX4_DIR_ENTRY_SIZE;
67 } 64 }
68 brelse(bh); 65 brelse(bh);
69 } 66 }
70out:
71 return 0; 67 return 0;
72} 68}
73 69
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
75{ 71{
76 .llseek = generic_file_llseek, 72 .llseek = generic_file_llseek,
77 .read = generic_read_dir, 73 .read = generic_read_dir,
78 .readdir = qnx4_readdir, 74 .iterate = qnx4_readdir,
79 .fsync = generic_file_fsync, 75 .fsync = generic_file_fsync,
80}; 76};
81 77
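
This qnx4 change is one instance of the tree-wide readdir-to-iterate conversion: the filldir callback plus opaque dirent pointer become a struct dir_context whose ->pos the filesystem advances directly, and entries are handed over with dir_emit(). A toy userspace model of the pattern, with the struct layout simplified and all names chosen for the example (the kernel's dir_emit() also takes a d_type):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Simplified analogue of the kernel's struct dir_context. */
	struct dir_context;
	typedef bool (*dir_actor_t)(struct dir_context *ctx, const char *name,
				    int namelen, uint64_t ino);

	struct dir_context {
		dir_actor_t actor;
		long long pos;
	};

	/* Analogue of dir_emit(): feed one entry to the actor; false = stop. */
	static bool dir_emit(struct dir_context *ctx, const char *name,
			     int namelen, uint64_t ino)
	{
		return ctx->actor(ctx, name, namelen, ino);
	}

	/* A toy ->iterate(): walk a fixed table, advancing ctx->pos as it goes. */
	static int toy_iterate(struct dir_context *ctx)
	{
		static const char *names[] = { ".", "..", "hello.txt" };
		size_t n = sizeof(names) / sizeof(names[0]);

		for (; (size_t)ctx->pos < n; ctx->pos++) {
			const char *name = names[ctx->pos];

			if (!dir_emit(ctx, name, (int)strlen(name),
				      (uint64_t)ctx->pos + 1))
				return 0;	/* ctx->pos marks the resume point */
		}
		return 0;
	}

	static bool print_actor(struct dir_context *ctx, const char *name,
				int namelen, uint64_t ino)
	{
		(void)ctx;
		printf("ino %llu: %.*s\n", (unsigned long long)ino, namelen, name);
		return true;		/* keep going */
	}

	int main(void)
	{
		struct dir_context ctx = { .actor = print_actor, .pos = 0 };

		toy_iterate(&ctx);
		return 0;
	}

The payoff, visible in the hunk above, is that per-filesystem f_pos bookkeeping disappears; the core copies f_pos into ctx->pos and back around the ->iterate() call.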
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index afa6be6fc397..15b7d92ed60d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
65 65
66static int qnx6_dir_longfilename(struct inode *inode, 66static int qnx6_dir_longfilename(struct inode *inode,
67 struct qnx6_long_dir_entry *de, 67 struct qnx6_long_dir_entry *de,
68 void *dirent, loff_t pos, 68 struct dir_context *ctx,
69 unsigned de_inode, filldir_t filldir) 69 unsigned de_inode)
70{ 70{
71 struct qnx6_long_filename *lf; 71 struct qnx6_long_filename *lf;
72 struct super_block *s = inode->i_sb; 72 struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
104 104
105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", 105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
106 lf_size, lf->lf_fname, de_inode)); 106 lf_size, lf->lf_fname, de_inode));
107 if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, 107 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
108 DT_UNKNOWN) < 0) {
109 qnx6_put_page(page); 108 qnx6_put_page(page);
110 return 0; 109 return 0;
111 } 110 }
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
115 return 1; 114 return 1;
116} 115}
117 116
118static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int qnx6_readdir(struct file *file, struct dir_context *ctx)
119{ 118{
120 struct inode *inode = file_inode(filp); 119 struct inode *inode = file_inode(file);
121 struct super_block *s = inode->i_sb; 120 struct super_block *s = inode->i_sb;
122 struct qnx6_sb_info *sbi = QNX6_SB(s); 121 struct qnx6_sb_info *sbi = QNX6_SB(s);
123 loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); 122 loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
124 unsigned long npages = dir_pages(inode); 123 unsigned long npages = dir_pages(inode);
125 unsigned long n = pos >> PAGE_CACHE_SHIFT; 124 unsigned long n = pos >> PAGE_CACHE_SHIFT;
126 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; 125 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
127 bool done = false; 126 bool done = false;
128 127
129 if (filp->f_pos >= inode->i_size) 128 ctx->pos = pos;
129 if (ctx->pos >= inode->i_size)
130 return 0; 130 return 0;
131 131
132 for ( ; !done && n < npages; n++, start = 0) { 132 for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
137 137
138 if (IS_ERR(page)) { 138 if (IS_ERR(page)) {
139 printk(KERN_ERR "qnx6_readdir: read failed\n"); 139 printk(KERN_ERR "qnx6_readdir: read failed\n");
140 filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; 140 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
141 return PTR_ERR(page); 141 return PTR_ERR(page);
142 } 142 }
143 de = ((struct qnx6_dir_entry *)page_address(page)) + start; 143 de = ((struct qnx6_dir_entry *)page_address(page)) + start;
144 for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { 144 for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
145 int size = de->de_size; 145 int size = de->de_size;
146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode); 146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
147 147
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
154 structure / block */ 154 structure / block */
155 if (!qnx6_dir_longfilename(inode, 155 if (!qnx6_dir_longfilename(inode,
156 (struct qnx6_long_dir_entry *)de, 156 (struct qnx6_long_dir_entry *)de,
157 dirent, pos, no_inode, 157 ctx, no_inode)) {
158 filldir)) {
159 done = true; 158 done = true;
160 break; 159 break;
161 } 160 }
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
163 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" 162 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
164 " inode:%u\n", size, de->de_fname, 163 " inode:%u\n", size, de->de_fname,
165 no_inode)); 164 no_inode));
166 if (filldir(dirent, de->de_fname, size, 165 if (!dir_emit(ctx, de->de_fname, size,
167 pos, no_inode, DT_UNKNOWN) 166 no_inode, DT_UNKNOWN)) {
168 < 0) {
169 done = true; 167 done = true;
170 break; 168 break;
171 } 169 }
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
173 } 171 }
174 qnx6_put_page(page); 172 qnx6_put_page(page);
175 } 173 }
176 filp->f_pos = pos;
177 return 0; 174 return 0;
178} 175}
179 176
@@ -282,7 +279,7 @@ found:
282const struct file_operations qnx6_dir_operations = { 279const struct file_operations qnx6_dir_operations = {
283 .llseek = generic_file_llseek, 280 .llseek = generic_file_llseek,
284 .read = generic_read_dir, 281 .read = generic_read_dir,
285 .readdir = qnx6_readdir, 282 .iterate = qnx6_readdir,
286 .fsync = generic_file_fsync, 283 .fsync = generic_file_fsync,
287}; 284};
288 285
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
2585 return proc_dointvec(table, write, buffer, lenp, ppos); 2585 return proc_dointvec(table, write, buffer, lenp, ppos);
2586} 2586}
2587 2587
2588static ctl_table fs_dqstats_table[] = { 2588static struct ctl_table fs_dqstats_table[] = {
2589 { 2589 {
2590 .procname = "lookups", 2590 .procname = "lookups",
2591 .data = &dqstats.stat[DQST_LOOKUPS], 2591 .data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
2654 { }, 2654 { },
2655}; 2655};
2656 2656
2657static ctl_table fs_table[] = { 2657static struct ctl_table fs_table[] = {
2658 { 2658 {
2659 .procname = "quota", 2659 .procname = "quota",
2660 .mode = 0555, 2660 .mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
2663 { }, 2663 { },
2664}; 2664};
2665 2665
2666static ctl_table sys_table[] = { 2666static struct ctl_table sys_table[] = {
2667 { 2667 {
2668 .procname = "fs", 2668 .procname = "fs",
2669 .mode = 0555, 2669 .mode = 0555,
diff --git a/fs/read_write.c b/fs/read_write.c
index 2cefa417be34..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -41,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
41 return file->f_mode & FMODE_UNSIGNED_OFFSET; 41 return file->f_mode & FMODE_UNSIGNED_OFFSET;
42} 42}
43 43
44static loff_t lseek_execute(struct file *file, struct inode *inode, 44/**
45 loff_t offset, loff_t maxsize) 45 * vfs_setpos - update the file offset for lseek
46 * @file: file structure in question
47 * @offset: file offset to seek to
48 * @maxsize: maximum file size
49 *
50 * This is a low-level filesystem helper for updating the file offset to
51 * the value specified by @offset if the given offset is valid and it is
52 * not equal to the current file offset.
53 *
54 * Return the specified offset on success and -EINVAL on invalid offset.
55 */
56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
46{ 57{
47 if (offset < 0 && !unsigned_offsets(file)) 58 if (offset < 0 && !unsigned_offsets(file))
48 return -EINVAL; 59 return -EINVAL;
@@ -55,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
55 } 66 }
56 return offset; 67 return offset;
57} 68}
69EXPORT_SYMBOL(vfs_setpos);
58 70
59/** 71/**
60 * generic_file_llseek_size - generic llseek implementation for regular files 72 * generic_file_llseek_size - generic llseek implementation for regular files
@@ -76,8 +88,6 @@ loff_t
76generic_file_llseek_size(struct file *file, loff_t offset, int whence, 88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
77 loff_t maxsize, loff_t eof) 89 loff_t maxsize, loff_t eof)
78{ 90{
79 struct inode *inode = file->f_mapping->host;
80
81 switch (whence) { 91 switch (whence) {
82 case SEEK_END: 92 case SEEK_END:
83 offset += eof; 93 offset += eof;
@@ -97,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
97 * like SEEK_SET. 107 * like SEEK_SET.
98 */ 108 */
99 spin_lock(&file->f_lock); 109 spin_lock(&file->f_lock);
100 offset = lseek_execute(file, inode, file->f_pos + offset, 110 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
101 maxsize);
102 spin_unlock(&file->f_lock); 111 spin_unlock(&file->f_lock);
103 return offset; 112 return offset;
104 case SEEK_DATA: 113 case SEEK_DATA:
@@ -120,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
120 break; 129 break;
121 } 130 }
122 131
123 return lseek_execute(file, inode, offset, maxsize); 132 return vfs_setpos(file, offset, maxsize);
124} 133}
125EXPORT_SYMBOL(generic_file_llseek_size); 134EXPORT_SYMBOL(generic_file_llseek_size);
126 135
@@ -145,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145EXPORT_SYMBOL(generic_file_llseek); 154EXPORT_SYMBOL(generic_file_llseek);
146 155
147/** 156/**
157 * fixed_size_llseek - llseek implementation for fixed-sized devices
158 * @file: file structure to seek on
159 * @offset: file offset to seek to
160 * @whence: type of seek
161 * @size: size of the file
162 *
163 */
164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
165{
166 switch (whence) {
167 case SEEK_SET: case SEEK_CUR: case SEEK_END:
168 return generic_file_llseek_size(file, offset, whence,
169 size, size);
170 default:
171 return -EINVAL;
172 }
173}
174EXPORT_SYMBOL(fixed_size_llseek);
175
176/**
148 * noop_llseek - No Operation Performed llseek implementation 177 * noop_llseek - No Operation Performed llseek implementation
149 * @file: file structure to seek on 178 * @file: file structure to seek on
150 * @offset: file offset to seek to 179 * @offset: file offset to seek to
@@ -296,7 +325,7 @@ out_putf:
296 * them to something that fits in "int" so that others 325 * them to something that fits in "int" so that others
297 * won't have to do range checks all the time. 326 * won't have to do range checks all the time.
298 */ 327 */
299int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
300{ 329{
301 struct inode *inode; 330 struct inode *inode;
302 loff_t pos; 331 loff_t pos;
@@ -477,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
477 if (f.file) { 506 if (f.file) {
478 loff_t pos = file_pos_read(f.file); 507 loff_t pos = file_pos_read(f.file);
479 ret = vfs_read(f.file, buf, count, &pos); 508 ret = vfs_read(f.file, buf, count, &pos);
480 file_pos_write(f.file, pos); 509 if (ret >= 0)
510 file_pos_write(f.file, pos);
481 fdput(f); 511 fdput(f);
482 } 512 }
483 return ret; 513 return ret;
@@ -492,7 +522,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
492 if (f.file) { 522 if (f.file) {
493 loff_t pos = file_pos_read(f.file); 523 loff_t pos = file_pos_read(f.file);
494 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
495 file_pos_write(f.file, pos); 525 if (ret >= 0)
526 file_pos_write(f.file, pos);
496 fdput(f); 527 fdput(f);
497 } 528 }
498 529
@@ -780,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
780 if (f.file) { 811 if (f.file) {
781 loff_t pos = file_pos_read(f.file); 812 loff_t pos = file_pos_read(f.file);
782 ret = vfs_readv(f.file, vec, vlen, &pos); 813 ret = vfs_readv(f.file, vec, vlen, &pos);
783 file_pos_write(f.file, pos); 814 if (ret >= 0)
815 file_pos_write(f.file, pos);
784 fdput(f); 816 fdput(f);
785 } 817 }
786 818
@@ -799,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
799 if (f.file) { 831 if (f.file) {
800 loff_t pos = file_pos_read(f.file); 832 loff_t pos = file_pos_read(f.file);
801 ret = vfs_writev(f.file, vec, vlen, &pos); 833 ret = vfs_writev(f.file, vec, vlen, &pos);
802 file_pos_write(f.file, pos); 834 if (ret >= 0)
835 file_pos_write(f.file, pos);
803 fdput(f); 836 fdput(f);
804 } 837 }
805 838
@@ -959,7 +992,8 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
959 return -EBADF; 992 return -EBADF;
960 pos = f.file->f_pos; 993 pos = f.file->f_pos;
961 ret = compat_readv(f.file, vec, vlen, &pos); 994 ret = compat_readv(f.file, vec, vlen, &pos);
962 f.file->f_pos = pos; 995 if (ret >= 0)
996 f.file->f_pos = pos;
963 fdput(f); 997 fdput(f);
964 return ret; 998 return ret;
965} 999}
@@ -1025,7 +1059,8 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1025 return -EBADF; 1059 return -EBADF;
1026 pos = f.file->f_pos; 1060 pos = f.file->f_pos;
1027 ret = compat_writev(f.file, vec, vlen, &pos); 1061 ret = compat_writev(f.file, vec, vlen, &pos);
1028 f.file->f_pos = pos; 1062 if (ret >= 0)
1063 f.file->f_pos = pos;
1029 fdput(f); 1064 fdput(f);
1030 return ret; 1065 return ret;
1031} 1066}
@@ -1129,7 +1164,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1129 if (in.file->f_flags & O_NONBLOCK) 1164 if (in.file->f_flags & O_NONBLOCK)
1130 fl = SPLICE_F_NONBLOCK; 1165 fl = SPLICE_F_NONBLOCK;
1131#endif 1166#endif
1167 file_start_write(out.file);
1132 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); 1168 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1169 file_end_write(out.file);
1133 1170
1134 if (retval > 0) { 1171 if (retval > 0) {
1135 add_rchar(current, retval); 1172 add_rchar(current, retval);
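
In this read_write.c hunk, lseek_execute() becomes the exported helper vfs_setpos(), fixed_size_llseek() builds on it for devices with a hard size limit, and the syscall paths stop writing back f_pos on error. A userspace sketch of the seek validation (names are invented; the real helpers also skip the store when the offset is unchanged and honour FMODE_UNSIGNED_OFFSET):

	#include <errno.h>
	#include <stdio.h>

	/* Validate against a fixed maximum, then commit, like vfs_setpos(). */
	static long long setpos(long long *pos, long long offset, long long maxsize)
	{
		if (offset < 0 || offset > maxsize)
			return -EINVAL;
		*pos = offset;
		return offset;
	}

	/* Seek within a fixed-size device, like fixed_size_llseek(). */
	static long long fixed_lseek(long long *pos, long long offset,
				     int whence, long long size)
	{
		switch (whence) {
		case SEEK_SET:
			return setpos(pos, offset, size);
		case SEEK_CUR:
			return setpos(pos, *pos + offset, size);
		case SEEK_END:
			return setpos(pos, size + offset, size);
		default:
			return -EINVAL;
		}
	}

	int main(void)
	{
		long long pos = 0, size = 4096;

		printf("%lld\n", fixed_lseek(&pos, 100, SEEK_SET, size)); /* 100 */
		printf("%lld\n", fixed_lseek(&pos, -10, SEEK_END, size)); /* 4086 */
		printf("%lld\n", fixed_lseek(&pos, 1, SEEK_END, size));   /* -EINVAL */
		return 0;
	}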
diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e04fae4..93d71e574310 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23int vfs_readdir(struct file *file, filldir_t filler, void *buf) 23int iterate_dir(struct file *file, struct dir_context *ctx)
24{ 24{
25 struct inode *inode = file_inode(file); 25 struct inode *inode = file_inode(file);
26 int res = -ENOTDIR; 26 int res = -ENOTDIR;
27 if (!file->f_op || !file->f_op->readdir) 27 if (!file->f_op || !file->f_op->iterate)
28 goto out; 28 goto out;
29 29
30 res = security_file_permission(file, MAY_READ); 30 res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
37 37
38 res = -ENOENT; 38 res = -ENOENT;
39 if (!IS_DEADDIR(inode)) { 39 if (!IS_DEADDIR(inode)) {
40 res = file->f_op->readdir(file, buf, filler); 40 ctx->pos = file->f_pos;
41 res = file->f_op->iterate(file, ctx);
42 file->f_pos = ctx->pos;
41 file_accessed(file); 43 file_accessed(file);
42 } 44 }
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
 }
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
 
 /*
  * Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
 };
 
 struct readdir_callback {
+	struct dir_context ctx;
 	struct old_linux_dirent __user * dirent;
 	int result;
 };
@@ -73,7 +75,7 @@ struct readdir_callback {
 static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
 		      u64 ino, unsigned int d_type)
 {
-	struct readdir_callback * buf = (struct readdir_callback *) __buf;
+	struct readdir_callback *buf = (struct readdir_callback *) __buf;
 	struct old_linux_dirent __user * dirent;
 	unsigned long d_ino;
 
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct readdir_callback buf;
+	struct readdir_callback buf = {
+		.ctx.actor = fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -137,6 +139,7 @@ struct linux_dirent {
 };
 
 struct getdents_callback {
+	struct dir_context ctx;
 	struct linux_dirent __user * current_dir;
 	struct linux_dirent __user * previous;
 	int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent __user * lastdirent;
-	struct getdents_callback buf;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 }
 
 struct getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user * current_dir;
 	struct linux_dirent64 __user * previous;
 	int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct getdents_callback64 buf;
+	struct getdents_callback64 buf = {
+		.ctx.actor = filldir64,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
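
The fs/readdir.c hunks above are the core of this conversion: vfs_readdir(file, filldir, buf) becomes iterate_dir(file, &buf.ctx), and the actor callback plus the directory position now travel in a struct dir_context embedded as the first member of each callback buffer instead of in a bare function pointer and file->f_pos. A minimal sketch of a caller under the new API (count_buf, fill_one and count_entries are illustrative names, not kernel functions; only the buffer layout mirrors the hunks above):

#include <linux/fs.h>

/* Sketch: the dir_context must be the first member, so the void *__buf
 * handed to the actor is also a pointer to the whole buffer. */
struct count_buf {
	struct dir_context ctx;		/* .actor and .pos live here */
	int entries;			/* private state for this caller */
};

static int fill_one(void *__buf, const char *name, int namlen,
		    loff_t offset, u64 ino, unsigned int d_type)
{
	struct count_buf *buf = __buf;	/* valid because ctx is first */

	buf->entries++;
	return 0;			/* non-zero stops the iteration */
}

static int count_entries(struct file *dir)
{
	struct count_buf buf = {
		.ctx.actor = fill_one,
	};
	int error = iterate_dir(dir, &buf.ctx);

	return error ?: buf.entries;
}

After iterate_dir() returns, the final position is available as buf.ctx.pos, which is exactly how the getdents hunks above recover the value they previously read from f.file->f_pos.
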
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6c2d136561cb..03e4ca5624d6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
 
 extern const struct reiserfs_key MIN_KEY;
 
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
 static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = reiserfs_readdir,
+	.iterate = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
 	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-static inline bool is_privroot_deh(struct dentry *dir,
-				   struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
 {
-	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	return (dir == dir->d_parent && privroot->d_inode &&
+	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+	return (privroot->d_inode &&
 		deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
-			    filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
 {
-	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
 	path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 		     entry_num++, deh++) {
 			int d_reclen;
 			char *d_name;
-			off_t d_off;
 			ino_t d_ino;
 
 			if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			}
 
 			/* Ignore the .reiserfs_priv entry */
-			if (is_privroot_deh(dentry, deh))
+			if (is_privroot_deh(inode, deh))
 				continue;
 
-			d_off = deh_offset(deh);
-			*pos = d_off;
+			ctx->pos = deh_offset(deh);
 			d_ino = deh_objectid(deh);
 			if (d_reclen <= 32) {
 				local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			 * the write lock here for other waiters
 			 */
 			reiserfs_write_unlock(inode->i_sb);
-			if (filldir
-			    (dirent, local_buf, d_reclen, d_off, d_ino,
-			     DT_UNKNOWN) < 0) {
+			if (!dir_emit
+			    (ctx, local_buf, d_reclen, d_ino,
+			     DT_UNKNOWN)) {
 				reiserfs_write_lock(inode->i_sb);
 				if (local_buf != small_buf) {
 					kfree(local_buf);
@@ -237,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 	}			/* while */
 
 end:
-	*pos = next_pos;
+	ctx->pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
 out:
@@ -245,10 +240,9 @@ out:
 	return ret;
 }
 
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+	return reiserfs_readdir_inode(file_inode(file), ctx);
 }
 
 /* compose directory item containing "." and ".." entries (entries are
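
On the filesystem side of the same conversion, an ->iterate method reports entries through dir_emit() and keeps its cursor in ctx->pos: dir_emit() returns false once the user buffer is full, the method returns 0, and the VFS calls it again later starting from the saved ctx->pos. A hedged sketch of the shape, not tied to reiserfs internals (examplefs_entry_name()/examplefs_entry_ino() are hypothetical lookup helpers):

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);

	/* Emit "." and ".." first; dir_emit_dots() advances ctx->pos to 2. */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* Walk the on-disk entries starting at the saved cursor. */
	for (; ctx->pos < inode->i_size; ctx->pos++) {
		const char *name = examplefs_entry_name(inode, ctx->pos);
		u64 ino = examplefs_entry_ino(inode, ctx->pos);

		if (!name)			/* hole in the directory */
			continue;
		if (!dir_emit(ctx, name, strlen(name), ino, DT_UNKNOWN))
			return 0;	/* buffer full; resume here later */
	}
	return 0;
}

The reiserfs hunk above follows the same contract: it stores deh_offset(deh) into ctx->pos before calling dir_emit(), so a short read resumes at the entry that did not fit.
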
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533792ee..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
 
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 
 	if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			goto out;
+
 		/*
 		 * is this block fully invalidated?
 		 */
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret) {
+	if (!partial_page && ret) {
 		ret = try_to_release_page(page, 0);
 		/* maybe should BUG_ON(!ret); - neilb */
 	}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 33532f79b4f7..a958444a75fc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -19,12 +19,13 @@
 /*
  * LOCKING:
  *
- * We rely on new Alexander Viro's super-block locking.
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
  *
  */
 
-static int show_version(struct seq_file *m, struct super_block *sb)
+static int show_version(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	char *format;
 
 	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
@@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb)
 #define DJP( x ) le32_to_cpu( jp -> x )
 #define JF( x ) ( r -> s_journal -> x )
 
-static int show_super(struct seq_file *m, struct super_block *sb)
+static int show_super(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 
 	seq_printf(m, "state: \t%s\n"
@@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_per_level(struct seq_file *m, struct super_block *sb)
+static int show_per_level(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	int level;
 
@@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_bitmap(struct seq_file *m, struct super_block *sb)
+static int show_bitmap(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 
 	seq_printf(m, "free_block: %lu\n"
@@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
+static int show_on_disk_super(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = sb_info->s_rs;
 	int hash_code = DFL(s_hash_function_code);
@@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_oidmap(struct seq_file *m, struct super_block *sb)
+static int show_oidmap(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = sb_info->s_rs;
 	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
@@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_journal(struct seq_file *m, struct super_block *sb)
+static int show_journal(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = r->s_rs;
 	struct journal_params *jp = &rs->s_v1.s_journal;
@@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-/* iterator */
-static int test_sb(struct super_block *sb, void *data)
-{
-	return data == sb;
-}
-
-static int set_sb(struct super_block *sb, void *data)
-{
-	return -ENOENT;
-}
-
-struct reiserfs_seq_private {
-	struct super_block *sb;
-	int (*show) (struct seq_file *, struct super_block *);
-};
-
-static void *r_start(struct seq_file *m, loff_t * pos)
-{
-	struct reiserfs_seq_private *priv = m->private;
-	loff_t l = *pos;
-
-	if (l)
-		return NULL;
-
-	if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
-		return NULL;
-
-	up_write(&priv->sb->s_umount);
-	return priv->sb;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t * pos)
-{
-	++*pos;
-	if (v)
-		deactivate_super(v);
-	return NULL;
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-	if (v)
-		deactivate_super(v);
-}
-
-static int r_show(struct seq_file *m, void *v)
-{
-	struct reiserfs_seq_private *priv = m->private;
-	return priv->show(m, v);
-}
-
-static const struct seq_operations r_ops = {
-	.start = r_start,
-	.next = r_next,
-	.stop = r_stop,
-	.show = r_show,
-};
-
 static int r_open(struct inode *inode, struct file *file)
 {
-	struct reiserfs_seq_private *priv;
-	int ret = seq_open_private(file, &r_ops,
-				   sizeof(struct reiserfs_seq_private));
-
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		priv = m->private;
-		priv->sb = proc_get_parent_data(inode);
-		priv->show = PDE_DATA(inode);
-	}
-	return ret;
+	return single_open(file, PDE_DATA(inode),
+			   proc_get_parent_data(inode));
 }
 
 static const struct file_operations r_file_operations = {
 	.open = r_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = seq_release_private,
-	.owner = THIS_MODULE,
+	.release = single_release,
 };
 
 static struct proc_dir_entry *proc_info_root = NULL;
 static const char proc_info_root_name[] = "fs/reiserfs";
 
 static void add_file(struct super_block *sb, char *name,
-		     int (*func) (struct seq_file *, struct super_block *))
+		     int (*func) (struct seq_file *, void *))
 {
 	proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
 			 &r_file_operations, func);
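
The procfs.c rewrite replaces a hand-rolled seq_operations iterator, which took and dropped a superblock reference on every read via sget()/deactivate_super(), with single_open(): each file shows exactly one object, and single_open() puts its third argument into m->private. A minimal sketch of the same open/release pairing (example_show and its output are illustrative; in the reiserfs code above the show routine itself is stored in PDE_DATA() and the superblock in the parent's data):

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *unused)
{
	/* whatever was passed as the third argument to single_open() */
	seq_printf(m, "state:\t%s\n", (char *)m->private);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, PDE_DATA(inode));
}

static const struct file_operations example_proc_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

The matching reiserfs/super.c hunks below move reiserfs_proc_info_done() from ->put_super to the top of ->kill_sb, which is what makes the simpler life cycle safe: the proc entries are gone before the superblock starts going away, so a concurrent read can no longer race with teardown.
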
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab303..3df5ce6c724d 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
 extern const struct inode_operations reiserfs_symlink_inode_operations;
 extern const struct inode_operations reiserfs_special_inode_operations;
 extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
 
 /* tail_conversion.c */
 int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f8a23c3078f8..e2e202a07b31 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -499,6 +499,7 @@ int remove_save_link(struct inode *inode, int truncate)
 static void reiserfs_kill_sb(struct super_block *s)
 {
 	if (REISERFS_SB(s)) {
+		reiserfs_proc_info_done(s);
 		/*
 		 * Force any pending inode evictions to occur now. Any
 		 * inodes to be removed that have extended attributes
@@ -554,8 +555,6 @@ static void reiserfs_put_super(struct super_block *s)
 			 REISERFS_SB(s)->reserved_blocks);
 	}
 
-	reiserfs_proc_info_done(s);
-
 	reiserfs_write_unlock(s);
 	mutex_destroy(&REISERFS_SB(s)->lock);
 	kfree(s->s_fs_info);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 821bcf70e467..c69cdd749f09 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
  * modifying extended attributes. This includes operations such as permissions
  * or ownership changes, object deletions, etc. */
 struct reiserfs_dentry_buf {
+	struct dir_context ctx;
 	struct dentry *xadir;
 	int count;
 	struct dentry *dentries[8];
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 {
 	struct dentry *dir;
 	int i, err = 0;
-	loff_t pos = 0;
 	struct reiserfs_dentry_buf buf = {
-		.count = 0,
+		.ctx.actor = fill_with_dentries,
 	};
 
 	/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 	reiserfs_write_lock(inode->i_sb);
 
 	buf.xadir = dir;
-	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
-	while ((err == 0 || err == -ENOSPC) && buf.count) {
-		err = 0;
-
-		for (i = 0; i < buf.count && buf.dentries[i]; i++) {
-			int lerr = 0;
+	while (1) {
+		err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+		if (err)
+			break;
+		if (!buf.count)
+			break;
+		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
 			struct dentry *dentry = buf.dentries[i];
 
-			if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
-				lerr = action(dentry, data);
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				err = action(dentry, data);
 
 			dput(dentry);
 			buf.dentries[i] = NULL;
-			err = lerr ?: err;
 		}
+		if (err)
+			break;
 		buf.count = 0;
-		if (!err)
-			err = reiserfs_readdir_dentry(dir, &buf,
-						      fill_with_dentries, &pos);
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
-	/* Clean up after a failed readdir */
 	cleanup_dentry_buf(&buf);
 
 	if (!err) {
@@ -800,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 struct listxattr_buf {
+	struct dir_context ctx;
 	size_t size;
 	size_t pos;
 	char *buf;
@@ -845,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
 	struct dentry *dir;
 	int err = 0;
-	loff_t pos = 0;
 	struct listxattr_buf buf = {
+		.ctx.actor = listxattr_filler,
 		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
@@ -868,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
-	err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+	err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 	if (!err)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41ee365..ff1d3d42e72a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
 /*
  * read the entries from a directory
  */
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *i = file_inode(filp);
+	struct inode *i = file_inode(file);
 	struct romfs_inode ri;
 	unsigned long offset, maxoff;
 	int j, ino, nextfh;
-	int stored = 0;
 	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
 	int ret;
 
 	maxoff = romfs_maxsize(i->i_sb);
 
-	offset = filp->f_pos;
+	offset = ctx->pos;
 	if (!offset) {
 		offset = i->i_ino & ROMFH_MASK;
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	for (;;) {
 		if (!offset || offset >= maxoff) {
 			offset = maxoff;
-			filp->f_pos = offset;
+			ctx->pos = offset;
 			goto out;
 		}
-		filp->f_pos = offset;
+		ctx->pos = offset;
 
 		/* Fetch inode info */
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		nextfh = be32_to_cpu(ri.next);
 		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
 			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+		if (!dir_emit(ctx, fsname, j, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]))
 			goto out;
 
-		stored++;
 		offset = nextfh & ROMFH_MASK;
 	}
-
 out:
-	return stored;
+	return 0;
 }
 
 /*
@@ -281,7 +278,7 @@ error:
 
 static const struct file_operations romfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = romfs_readdir,
+	.iterate = romfs_readdir,
 	.llseek = default_llseek,
 };
 
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c27062..35d4adc749d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,8 @@
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/busy_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	set_current_state(state);
 	if (!pwq->triggered)
-		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+		rc = freezable_schedule_hrtimeout_range(expires, slack,
+							HRTIMER_MODE_ABS);
 	__set_current_state(TASK_RUNNING);
 
 	/*
@@ -384,9 +387,10 @@ get_max:
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-				unsigned long out, unsigned long bit)
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
 {
-	wait->_key = POLLEX_SET;
+	wait->_key = POLLEX_SET | ll_flag;
 	if (in & bit)
 		wait->_key |= POLLIN_SET;
 	if (out & bit)
@@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_table *wait;
 	int retval, i, timed_out = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_busy_loop = false;
 
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				f_op = f.file->f_op;
 				mask = DEFAULT_POLLMASK;
 				if (f_op && f_op->poll) {
-					wait_key_set(wait, in, out, bit);
+					wait_key_set(wait, in, out,
+						     bit, busy_flag);
 					mask = (*f_op->poll)(f.file, wait);
 				}
 				fdput(f);
@@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					retval++;
 					wait->_qproc = NULL;
 				}
+				/* got something, stop busy polling */
+				if (retval) {
+					can_busy_loop = false;
+					busy_flag = 0;
+
+				/*
+				 * only remember a returned
+				 * POLL_BUSY_LOOP if we asked for it
+				 */
+				} else if (busy_flag & mask)
+					can_busy_loop = true;
+
 			}
 		}
 		if (res_in)
@@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			break;
 		}
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
@@ -717,7 +748,9 @@ struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_busy_poll,
+				     unsigned int busy_flag)
 {
 	unsigned int mask;
 	int fd;
@@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 		mask = DEFAULT_POLLMASK;
 		if (f.file->f_op && f.file->f_op->poll) {
 			pwait->_key = pollfd->events|POLLERR|POLLHUP;
+			pwait->_key |= busy_flag;
 			mask = f.file->f_op->poll(f.file, pwait);
+			if (mask & busy_flag)
+				*can_busy_poll = true;
 		}
 		/* Mask out unneeded events. */
 		mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 
 	for (;;) {
 		struct poll_list *walk;
+		bool can_busy_loop = false;
 
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
@@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
-				if (do_pollfd(pfd, pt)) {
+				if (do_pollfd(pfd, pt, &can_busy_loop,
+					      busy_flag)) {
 					count++;
 					pt->_qproc = NULL;
+					/* found something, stop busy polling */
+					busy_flag = 0;
+					can_busy_loop = false;
 				}
 			}
 		}
@@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 		if (count || timed_out)
 			break;
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
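
Both do_select() and do_poll() gain the same low-latency pattern here: POLL_BUSY_LOOP is ORed into the poll key, a ->poll handler that supports busy polling echoes the flag back in its mask, and instead of sleeping immediately the loop spins on the socket until an event arrives, the busy budget expires, or the scheduler needs the CPU. Reduced to its control flow, the shared skeleton looks like this (a condensed sketch of the hunks above, not a verbatim excerpt; the per-fd scan is elided):

	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_end = 0;

	for (;;) {
		bool can_busy_loop = false;

		/* scan the fds, passing busy_flag in the poll key; a handler
		 * that returns POLL_BUSY_LOOP sets can_busy_loop = true,
		 * and any real event clears it and zeroes busy_flag */

		if (count || timed_out)
			break;

		if (can_busy_loop && !need_resched()) {
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;	/* start the busy budget */
			}
			if (!busy_loop_timeout(busy_end))
				continue;	/* budget not yet spent */
		}
		busy_flag = 0;	/* give up busy polling for this call */

		/* fall through to poll_schedule_timeout() and sleep */
	}

The freezable_schedule_hrtimeout_range() change in poll_schedule_timeout() is independent of this: it lets a task blocked in select/poll be frozen for suspend without the freeze counting as a wakeup.
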
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
 	return rcu_dereference(node->next);
 }
 EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_percpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+	struct hlist_node *node;
+
+	for_each_possible_cpu(*cpu) {
+		hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+			if (pos-- == 0)
+				return node;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v:    pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+		      int *cpu, loff_t *pos)
+{
+	struct hlist_node *node = v;
+
+	++*pos;
+
+	if (node->next)
+		return node->next;
+
+	for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+	     *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+		struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+		if (!hlist_empty(bucket))
+			return bucket->first;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
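
The two new helpers flatten a percpu array of hlists into a single seq_file stream, with the current CPU kept in a cursor the caller owns (typically in the seq_file private data, since *pos alone cannot encode both the CPU and the position within its list). A sketch of the start/next pair that would sit on top (example_iter and example_list are hypothetical):

struct example_iter {
	int cpu;		/* cursor handed to the percpu helpers */
};

static DEFINE_PER_CPU(struct hlist_head, example_list);

static void *example_seq_start(struct seq_file *m, loff_t *pos)
{
	struct example_iter *iter = m->private;

	return seq_hlist_start_percpu(&example_list, &iter->cpu, *pos);
}

static void *example_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct example_iter *iter = m->private;

	return seq_hlist_next_percpu(v, &example_list, &iter->cpu, pos);
}

Note that seq_hlist_start_percpu() is O(pos) on every ->start(), so this suits short lists; a restarted read simply re-walks from the first possible CPU.
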
diff --git a/fs/splice.c b/fs/splice.c
index d37431dd60a1..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1098,27 +1098,13 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
-	int ret;
-
-	if (unlikely(!(out->f_mode & FMODE_WRITE)))
-		return -EBADF;
-
-	if (unlikely(out->f_flags & O_APPEND))
-		return -EINVAL;
-
-	ret = rw_verify_area(WRITE, out, ppos, len);
-	if (unlikely(ret < 0))
-		return ret;
 
 	if (out->f_op && out->f_op->splice_write)
 		splice_write = out->f_op->splice_write;
 	else
 		splice_write = default_file_splice_write;
 
-	file_start_write(out);
-	ret = splice_write(pipe, out, ppos, len, flags);
-	file_end_write(out);
-	return ret;
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
@@ -1307,6 +1293,16 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	};
 	long ret;
 
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, opos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 	if (ret > 0)
 		*ppos = sd.pos;
@@ -1362,7 +1358,19 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		offset = out->f_pos;
 	}
 
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, &offset, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	file_start_write(out);
 	ret = do_splice_from(ipipe, out, &offset, len, flags);
+	file_end_write(out);
 
 	if (!off_out)
 		out->f_pos = offset;
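
The net effect of the three splice.c hunks is a relocation rather than a behavior change on the common path: the FMODE_WRITE/O_APPEND checks and rw_verify_area() move out of do_splice_from() into its callers, so do_splice_direct() (the sendfile path) validates the output file once before splice_direct_to_actor() starts looping instead of once per chunk, and the freeze-protection bracket moves with them. The caller-side shape, condensed from the do_splice() hunk above (sketch; error handling exactly as shown there):

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;
	ret = rw_verify_area(WRITE, out, &offset, len);
	if (unlikely(ret < 0))
		return ret;

	file_start_write(out);	/* block freezing for the whole transfer */
	ret = do_splice_from(ipipe, out, &offset, len, flags);
	file_end_write(out);
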
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70ebbb19..f7f527bf8c10 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
 }
 
 
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 	 * It also means that the external f_pos is offset by 3 from the
 	 * on-disk directory f_pos.
 	 */
-	while (file->f_pos < 3) {
+	while (ctx->pos < 3) {
 		char *name;
 		int i_ino;
 
-		if (file->f_pos == 0) {
+		if (ctx->pos == 0) {
 			name = ".";
 			size = 1;
 			i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 			i_ino = squashfs_i(inode)->parent;
 		}
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
-				dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]);
-
-		if (filldir(dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]) < 0) {
-			TRACE("Filldir returned less than 0\n");
+		if (!dir_emit(ctx, name, size, i_ino,
+				squashfs_filetype_table[1]))
 			goto finish;
-		}
 
-		file->f_pos += size;
+		ctx->pos += size;
 	}
 
 	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
 				squashfs_i(inode)->dir_idx_start,
 				squashfs_i(inode)->dir_idx_offset,
 				squashfs_i(inode)->dir_idx_cnt,
-				file->f_pos);
+				ctx->pos);
 
 	while (length < i_size_read(inode)) {
 		/*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 		length += sizeof(*dire) + size;
 
-		if (file->f_pos >= length)
+		if (ctx->pos >= length)
 			continue;
 
 		dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 				((short) le16_to_cpu(dire->inode_number));
 		type = le16_to_cpu(dire->type);
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
-				"\n", dirent, dire->name, size,
-				file->f_pos,
-				le32_to_cpu(dirh.start_block),
-				le16_to_cpu(dire->offset),
-				inode_number,
-				squashfs_filetype_table[type]);
-
-		if (filldir(dirent, dire->name, size, file->f_pos,
+		if (!dir_emit(ctx, dire->name, size,
 				inode_number,
-				squashfs_filetype_table[type]) < 0) {
-			TRACE("Filldir returned less than 0\n");
+				squashfs_filetype_table[type]))
 			goto finish;
-		}
 
-		file->f_pos = length;
+		ctx->pos = length;
 		}
 	}
 
@@ -238,6 +222,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir,
+	.iterate = squashfs_readdir,
 	.llseek = default_llseek,
 };
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..68307c029228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super);
  * and want to turn it into a full-blown active reference. grab_super()
  * is called with sb_lock held and drops it. Returns 1 in case of
  * success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called).
+ * dying when grab_super() had been called). Note that this is only
+ * called for superblocks not in rundown mode (== ones still on ->fs_supers
+ * of their type), so increment of ->s_count is OK here.
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
-	if (atomic_inc_not_zero(&s->s_active)) {
-		spin_unlock(&sb_lock);
-		return 1;
-	}
-	/* it's going away */
 	s->s_count++;
 	spin_unlock(&sb_lock);
-	/* wait for it to die */
 	down_write(&s->s_umount);
+	if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+		put_super(s);
+		return 1;
+	}
 	up_write(&s->s_umount);
 	put_super(s);
 	return 0;
@@ -463,11 +463,6 @@ retry:
 			destroy_super(s);
 			s = NULL;
 		}
-		down_write(&old->s_umount);
-		if (unlikely(!(old->s_flags & MS_BORN))) {
-			deactivate_locked_super(old);
-			goto retry;
-		}
 		return old;
 	}
 }
@@ -660,10 +655,10 @@ restart:
 		if (hlist_unhashed(&sb->s_instances))
 			continue;
 		if (sb->s_bdev == bdev) {
-			if (grab_super(sb)) /* drops sb_lock */
-				return sb;
-			else
+			if (!grab_super(sb))
 				goto restart;
+			up_write(&sb->s_umount);
+			return sb;
 		}
 	}
 	spin_unlock(&sb_lock);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e8e0e71b29d5..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
 }
 
 /**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
  * @sd: sysfs_dirent of interest
  *
  * Link @sd into its sibling rbtree which starts from
@@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	return pos;
 }
 
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = filp->private_data;
+	struct sysfs_dirent *pos = file->private_data;
 	enum kobj_ns_type type;
 	const void *ns;
-	ino_t ino;
-	loff_t off;
 
 	type = sysfs_ns_type(parent_sd);
 	ns = sysfs_info(dentry->d_sb)->ns[type];
 
-	if (filp->f_pos == 0) {
-		ino = parent_sd->s_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
-	if (filp->f_pos == 1) {
-		if (parent_sd->s_parent)
-			ino = parent_sd->s_parent->s_ino;
-		else
-			ino = parent_sd->s_ino;
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 	mutex_lock(&sysfs_mutex);
-	off = filp->f_pos;
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
-		const char * name;
-		unsigned int type;
-		int len, ret;
-
-		name = pos->s_name;
-		len = strlen(name);
-		ino = pos->s_ino;
-		type = dt_type(pos);
-		off = filp->f_pos = pos->s_hash;
-		filp->private_data = sysfs_get(pos);
+	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+		const char *name = pos->s_name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->s_ino;
+		ctx->pos = pos->s_hash;
+		file->private_data = sysfs_get(pos);
 
 		mutex_unlock(&sysfs_mutex);
-		ret = filldir(dirent, name, len, off, ino, type);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
 		mutex_lock(&sysfs_mutex);
-		if (ret < 0)
-			break;
 	}
 	mutex_unlock(&sysfs_mutex);
-
-	/* don't reference last entry if its refcount is dropped */
-	if (!pos) {
-		filp->private_data = NULL;
-
-		/* EOF and not changed as 0 or 1 in read/write path */
-		if (off == filp->f_pos && off > 1)
-			filp->f_pos = INT_MAX;
-	}
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
 	return 0;
 }
 
@@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
 
 const struct file_operations sysfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = sysfs_readdir,
+	.iterate = sysfs_readdir,
 	.release = sysfs_dir_release,
 	.llseek = sysfs_dir_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
 
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	od = sd->s_attr.open;
-	if (od) {
-		atomic_inc(&od->event);
-		wake_up_interruptible(&od->poll);
+	if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+		od = sd->s_attr.open;
+		if (od) {
+			atomic_inc(&od->event);
+			wake_up_interruptible(&od->poll);
+		}
 	}
 
 	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..09a1a25cd145 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
-	int i;
+	struct bin_attribute *const* bin_attr;
 
-	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->attrs)
+		for (attr = grp->attrs; *attr; attr++)
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->bin_attrs)
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+			sysfs_remove_bin_file(kobj, *bin_attr);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const* attr;
+	struct bin_attribute *const* bin_attr;
 	int error = 0, i;
 
-	for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
-		umode_t mode = 0;
+	if (grp->attrs) {
+		for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+			umode_t mode = 0;
+
+			/*
+			 * In update mode, we're changing the permissions or
+			 * visibility. Do this by first removing then
+			 * re-adding (if required) the file.
+			 */
+			if (update)
+				sysfs_hash_and_remove(dir_sd, NULL,
+						      (*attr)->name);
+			if (grp->is_visible) {
+				mode = grp->is_visible(kobj, *attr, i);
+				if (!mode)
+					continue;
+			}
+			error = sysfs_add_file_mode(dir_sd, *attr,
+						    SYSFS_KOBJ_ATTR,
+						    (*attr)->mode | mode);
+			if (unlikely(error))
+				break;
+		}
+		if (error) {
+			remove_files(dir_sd, kobj, grp);
+			goto exit;
+		}
+	}
 
-		/* in update mode, we're changing the permissions or
-		 * visibility. Do this by first removing then
-		 * re-adding (if required) the file */
-		if (update)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
-		if (grp->is_visible) {
-			mode = grp->is_visible(kobj, *attr, i);
-			if (!mode)
-				continue;
+	if (grp->bin_attrs) {
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+			if (update)
+				sysfs_remove_bin_file(kobj, *bin_attr);
+			error = sysfs_create_bin_file(kobj, *bin_attr);
+			if (error)
+				break;
 		}
-		error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
-					    (*attr)->mode | mode);
-		if (unlikely(error))
-			break;
+		if (error)
+			remove_files(dir_sd, kobj, grp);
 	}
-	if (error)
-		remove_files(dir_sd, kobj, grp);
+exit:
 	return error;
 }
 
@@ -67,8 +93,8 @@ static int internal_create_group(struct kobject *kobj, int update,
 	/* Updates may happen before the object has been instantiated */
 	if (unlikely(update && !kobj->sd))
 		return -EINVAL;
-	if (!grp->attrs) {
-		WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+	if (!grp->attrs && !grp->bin_attrs) {
+		WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
 		     kobj->name, grp->name ? "" : grp->name);
 		return -EINVAL;
 	}
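
With bin_attrs in struct attribute_group, a driver can register normal and binary sysfs files in one step instead of pairing sysfs_create_group() with separate sysfs_create_bin_file() calls. A sketch of a group using the new field (the two attribute definitions are hypothetical placeholders):

static struct attribute *example_attrs[] = {
	&dev_attr_status.attr,		/* from a DEVICE_ATTR() somewhere */
	NULL,
};

static struct bin_attribute *example_bin_attrs[] = {
	&bin_attr_firmware,		/* from a BIN_ATTR() somewhere */
	NULL,
};

static const struct attribute_group example_group = {
	.attrs		= example_attrs,
	.bin_attrs	= example_bin_attrs,
};

/* sysfs_create_group(kobj, &example_group) now creates both kinds of
 * file, and sysfs_remove_group() tears both down. */

Either array may be omitted; per the internal_create_group() hunk above, the WARN fires only when both are missing.
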
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
 #include <linux/security.h>
 #include "sysfs.h"
 
-extern struct super_block * sysfs_sb;
-
 static const struct address_space_operations sysfs_aops = {
 	.readpage = simple_readpage,
 	.write_begin = simple_write_begin,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8dac3eb..d42291d08215 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
 #include <linux/swap.h>
 #include "sysv.h"
 
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
 
 const struct file_operations sysv_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = sysv_readdir,
+	.iterate = sysv_readdir,
 	.fsync = generic_file_fsync,
 };
 
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	unsigned long pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+	ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)(kaddr+offset);
 		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
-		for ( ;(char*)de <= limit; de++) {
+		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
 			char *name = de->name;
-			int over;
 
 			if (!de->inode)
 				continue;
 
-			offset = (char *)de - kaddr;
-
-			over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
-					((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
 					fs16_to_cpu(SYSV_SB(sb), de->inode),
-					DT_UNKNOWN);
-			if (over) {
+					DT_UNKNOWN)) {
 				dir_put_page(page);
-				goto done;
+				return 0;
 			}
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	/* Truncate the name in place, avoids having to define a compare
 	   function. */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/alarmtimer.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/init.h>
@@ -26,7 +27,10 @@
 #include <linux/rcupdate.h>
 
 struct timerfd_ctx {
-	struct hrtimer tmr;
+	union {
+		struct hrtimer tmr;
+		struct alarm alarm;
+	} t;
 	ktime_t tintv;
 	ktime_t moffs;
 	wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
 static LIST_HEAD(cancel_list);
 static DEFINE_SPINLOCK(cancel_lock);
 
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+	return ctx->clockid == CLOCK_REALTIME_ALARM ||
+		ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
  * tintv.tv64 != 0) until the timer is accessed.
  */
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
 {
-	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	ctx->ticks++;
 	wake_up_locked(&ctx->wqh);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
 
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+					       t.tmr);
+	timerfd_triggered(ctx);
 	return HRTIMER_NORESTART;
 }
 
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+						 ktime_t now)
+{
+	struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+					       t.alarm);
+	timerfd_triggered(ctx);
+	return ALARMTIMER_NORESTART;
+}
+
 /*
  * Called when the clock was set to cancel the timers in the cancel
  * list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 
 static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 {
-	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
-	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
+	if ((ctx->clockid == CLOCK_REALTIME ||
+	     ctx->clockid == CLOCK_REALTIME_ALARM) &&
+	    (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
 		if (!ctx->might_cancel) {
 			ctx->might_cancel = true;
 			spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
 
-	remaining = hrtimer_expires_remaining(&ctx->tmr);
+	if (isalarm(ctx))
+		remaining = alarm_expires_remaining(&ctx->t.alarm);
+	else
+		remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->expired = 0;
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, clockid, htmode);
-	hrtimer_set_expires(&ctx->tmr, texp);
-	ctx->tmr.function = timerfd_tmrproc;
+
+	if (isalarm(ctx)) {
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	} else {
+		hrtimer_init(&ctx->t.tmr, clockid, htmode);
+		hrtimer_set_expires(&ctx->t.tmr, texp);
+		ctx->t.tmr.function = timerfd_tmrproc;
184 }
185
148 if (texp.tv64 != 0) { 186 if (texp.tv64 != 0) {
149 hrtimer_start(&ctx->tmr, texp, htmode); 187 if (isalarm(ctx)) {
188 if (flags & TFD_TIMER_ABSTIME)
189 alarm_start(&ctx->t.alarm, texp);
190 else
191 alarm_start_relative(&ctx->t.alarm, texp);
192 } else {
193 hrtimer_start(&ctx->t.tmr, texp, htmode);
194 }
195
150 if (timerfd_canceled(ctx)) 196 if (timerfd_canceled(ctx))
151 return -ECANCELED; 197 return -ECANCELED;
152 } 198 }
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
158 struct timerfd_ctx *ctx = file->private_data; 204 struct timerfd_ctx *ctx = file->private_data;
159 205
160 timerfd_remove_cancel(ctx); 206 timerfd_remove_cancel(ctx);
161 hrtimer_cancel(&ctx->tmr); 207
208 if (isalarm(ctx))
209 alarm_cancel(&ctx->t.alarm);
210 else
211 hrtimer_cancel(&ctx->t.tmr);
162 kfree_rcu(ctx, rcu); 212 kfree_rcu(ctx, rcu);
163 return 0; 213 return 0;
164} 214}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
215 * callback to avoid DoS attacks specifying a very 265 * callback to avoid DoS attacks specifying a very
216 * short timer period. 266 * short timer period.
217 */ 267 */
218 ticks += hrtimer_forward_now(&ctx->tmr, 268 if (isalarm(ctx)) {
219 ctx->tintv) - 1; 269 ticks += alarm_forward_now(
220 hrtimer_restart(&ctx->tmr); 270 &ctx->t.alarm, ctx->tintv) - 1;
271 alarm_restart(&ctx->t.alarm);
272 } else {
273 ticks += hrtimer_forward_now(&ctx->t.tmr,
274 ctx->tintv) - 1;
275 hrtimer_restart(&ctx->t.tmr);
276 }
221 } 277 }
222 ctx->expired = 0; 278 ctx->expired = 0;
223 ctx->ticks = 0; 279 ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
259 315
260 if ((flags & ~TFD_CREATE_FLAGS) || 316 if ((flags & ~TFD_CREATE_FLAGS) ||
261 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
262 clockid != CLOCK_REALTIME)) 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME_ALARM))
263 return -EINVAL; 321 return -EINVAL;
264 322
265 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 323 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
268 326
269 init_waitqueue_head(&ctx->wqh); 327 init_waitqueue_head(&ctx->wqh);
270 ctx->clockid = clockid; 328 ctx->clockid = clockid;
271 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 329
330 if (isalarm(ctx))
331 alarm_init(&ctx->t.alarm,
332 ctx->clockid == CLOCK_REALTIME_ALARM ?
333 ALARM_REALTIME : ALARM_BOOTTIME,
334 timerfd_alarmproc);
335 else
336 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
337
272 ctx->moffs = ktime_get_monotonic_offset(); 338 ctx->moffs = ktime_get_monotonic_offset();
273 339
274 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 340 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
305 */ 371 */
306 for (;;) { 372 for (;;) {
307 spin_lock_irq(&ctx->wqh.lock); 373 spin_lock_irq(&ctx->wqh.lock);
308 if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) 374
309 break; 375 if (isalarm(ctx)) {
376 if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
377 break;
378 } else {
379 if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
380 break;
381 }
310 spin_unlock_irq(&ctx->wqh.lock); 382 spin_unlock_irq(&ctx->wqh.lock);
311 cpu_relax(); 383 cpu_relax();
312 } 384 }
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
317 * We do not update "ticks" and "expired" since the timer will be 389 * We do not update "ticks" and "expired" since the timer will be
318 * re-programmed again in the following timerfd_setup() call. 390 * re-programmed again in the following timerfd_setup() call.
319 */ 391 */
320 if (ctx->expired && ctx->tintv.tv64) 392 if (ctx->expired && ctx->tintv.tv64) {
321 hrtimer_forward_now(&ctx->tmr, ctx->tintv); 393 if (isalarm(ctx))
394 alarm_forward_now(&ctx->t.alarm, ctx->tintv);
395 else
396 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
397 }
322 398
323 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 399 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
324 old->it_interval = ktime_to_timespec(ctx->tintv); 400 old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
345 spin_lock_irq(&ctx->wqh.lock); 421 spin_lock_irq(&ctx->wqh.lock);
346 if (ctx->expired && ctx->tintv.tv64) { 422 if (ctx->expired && ctx->tintv.tv64) {
347 ctx->expired = 0; 423 ctx->expired = 0;
348 ctx->ticks += 424
349 hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; 425 if (isalarm(ctx)) {
350 hrtimer_restart(&ctx->tmr); 426 ctx->ticks +=
427 alarm_forward_now(
428 &ctx->t.alarm, ctx->tintv) - 1;
429 alarm_restart(&ctx->t.alarm);
430 } else {
431 ctx->ticks +=
432 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
433 - 1;
434 hrtimer_restart(&ctx->t.tmr);
435 }
351 } 436 }
352 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 437 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
353 t->it_interval = ktime_to_timespec(ctx->tintv); 438 t->it_interval = ktime_to_timespec(ctx->tintv);
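
Taken together, the timerfd hunks let a timerfd be backed by the alarmtimer subsystem instead of a plain hrtimer, so an armed CLOCK_REALTIME_ALARM or CLOCK_BOOTTIME_ALARM timer can wake the system from suspend. A hedged userspace sketch (assumes a libc that exposes the new clockids; depending on kernel version CAP_WAKE_ALARM may also be required):

#include <sys/timerfd.h>
#include <time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 30 } };
	uint64_t expirations;
	int fd = timerfd_create(CLOCK_BOOTTIME_ALARM, 0);

	if (fd < 0) {	/* older kernels reject the alarm clockids */
		perror("timerfd_create");
		return 1;
	}
	/* relative 30s timer; may resume a suspended machine */
	if (timerfd_settime(fd, 0, &its, NULL) < 0)
		perror("timerfd_settime");
	/* blocks until expiry, then reports the expiration count */
	if (read(fd, &expirations, sizeof(expirations)) == sizeof(expirations))
		printf("expirations: %llu\n", (unsigned long long)expirations);
	close(fd);
	return 0;
}

Internally, the union in timerfd_ctx means only one of the two timer types ever exists per context, with isalarm() selecting the arm/cancel/forward path everywhere the old code touched ctx->tmr directly.
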
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 605af512aec2..6b4947f75af7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,19 +346,18 @@ static unsigned int vfs_dent_type(uint8_t type)
346 * This means that UBIFS cannot support NFS which requires full 346 * This means that UBIFS cannot support NFS which requires full
347 * 'seekdir()'/'telldir()' support. 347 * 'seekdir()'/'telldir()' support.
348 */ 348 */
349static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) 349static int ubifs_readdir(struct file *file, struct dir_context *ctx)
350{ 350{
351 int err, over = 0; 351 int err;
352 loff_t pos = file->f_pos;
353 struct qstr nm; 352 struct qstr nm;
354 union ubifs_key key; 353 union ubifs_key key;
355 struct ubifs_dent_node *dent; 354 struct ubifs_dent_node *dent;
356 struct inode *dir = file_inode(file); 355 struct inode *dir = file_inode(file);
357 struct ubifs_info *c = dir->i_sb->s_fs_info; 356 struct ubifs_info *c = dir->i_sb->s_fs_info;
358 357
359 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos); 358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
360 359
361 if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2) 360 if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
362 /* 361 /*
363 * The directory was seek'ed to a senseless position or there 362 * The directory was seek'ed to a senseless position or there
364 * are no more entries. 363 * are no more entries.
@@ -384,19 +383,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
384 file->f_version = 1; 383 file->f_version = 1;
385 384
386 /* File positions 0 and 1 correspond to "." and ".." */ 385 /* File positions 0 and 1 correspond to "." and ".." */
387 if (pos == 0) { 386 if (ctx->pos < 2) {
388 ubifs_assert(!file->private_data);
389 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
390 if (over)
391 return 0;
392 file->f_pos = pos = 1;
393 }
394
395 if (pos == 1) {
396 ubifs_assert(!file->private_data); 387 ubifs_assert(!file->private_data);
397 over = filldir(dirent, "..", 2, 1, 388 if (!dir_emit_dots(file, ctx))
398 parent_ino(file->f_path.dentry), DT_DIR);
399 if (over)
400 return 0; 389 return 0;
401 390
402 /* Find the first entry in TNC and save it */ 391 /* Find the first entry in TNC and save it */
@@ -408,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
408 goto out; 397 goto out;
409 } 398 }
410 399
411 file->f_pos = pos = key_hash_flash(c, &dent->key); 400 ctx->pos = key_hash_flash(c, &dent->key);
412 file->private_data = dent; 401 file->private_data = dent;
413 } 402 }
414 403
@@ -416,16 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
416 if (!dent) { 405 if (!dent) {
417 /* 406 /*
418 * The directory was seek'ed to and is now readdir'ed. 407 * The directory was seek'ed to and is now readdir'ed.
419 * Find the entry corresponding to @pos or the closest one. 408 * Find the entry corresponding to @ctx->pos or the closest one.
420 */ 409 */
421 dent_key_init_hash(c, &key, dir->i_ino, pos); 410 dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
422 nm.name = NULL; 411 nm.name = NULL;
423 dent = ubifs_tnc_next_ent(c, &key, &nm); 412 dent = ubifs_tnc_next_ent(c, &key, &nm);
424 if (IS_ERR(dent)) { 413 if (IS_ERR(dent)) {
425 err = PTR_ERR(dent); 414 err = PTR_ERR(dent);
426 goto out; 415 goto out;
427 } 416 }
428 file->f_pos = pos = key_hash_flash(c, &dent->key); 417 ctx->pos = key_hash_flash(c, &dent->key);
429 file->private_data = dent; 418 file->private_data = dent;
430 } 419 }
431 420
@@ -437,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
437 ubifs_inode(dir)->creat_sqnum); 426 ubifs_inode(dir)->creat_sqnum);
438 427
439 nm.len = le16_to_cpu(dent->nlen); 428 nm.len = le16_to_cpu(dent->nlen);
440 over = filldir(dirent, dent->name, nm.len, pos, 429 if (!dir_emit(ctx, dent->name, nm.len,
441 le64_to_cpu(dent->inum), 430 le64_to_cpu(dent->inum),
442 vfs_dent_type(dent->type)); 431 vfs_dent_type(dent->type)))
443 if (over)
444 return 0; 432 return 0;
445 433
446 /* Switch to the next entry */ 434 /* Switch to the next entry */
@@ -453,17 +441,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
453 } 441 }
454 442
455 kfree(file->private_data); 443 kfree(file->private_data);
456 file->f_pos = pos = key_hash_flash(c, &dent->key); 444 ctx->pos = key_hash_flash(c, &dent->key);
457 file->private_data = dent; 445 file->private_data = dent;
458 cond_resched(); 446 cond_resched();
459
460 if (file->f_version == 0)
461 /*
462 * The file was seek'ed meanwhile, lets return and start
463 * reading direntries from the new position on the next
464 * invocation.
465 */
466 return 0;
467 } 447 }
468 448
469out: 449out:
@@ -475,15 +455,10 @@ out:
475 kfree(file->private_data); 455 kfree(file->private_data);
476 file->private_data = NULL; 456 file->private_data = NULL;
477 /* 2 is a special value indicating that there are no more direntries */ 457 /* 2 is a special value indicating that there are no more direntries */
478 file->f_pos = 2; 458 ctx->pos = 2;
479 return 0; 459 return 0;
480} 460}
481 461
482static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
483{
484 return generic_file_llseek(file, offset, whence);
485}
486
487/* Free saved readdir() state when the directory is closed */ 462/* Free saved readdir() state when the directory is closed */
488static int ubifs_dir_release(struct inode *dir, struct file *file) 463static int ubifs_dir_release(struct inode *dir, struct file *file)
489{ 464{
@@ -1201,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
1201}; 1176};
1202 1177
1203const struct file_operations ubifs_dir_operations = { 1178const struct file_operations ubifs_dir_operations = {
1204 .llseek = ubifs_dir_llseek, 1179 .llseek = generic_file_llseek,
1205 .release = ubifs_dir_release, 1180 .release = ubifs_dir_release,
1206 .read = generic_read_dir, 1181 .read = generic_read_dir,
1207 .readdir = ubifs_readdir, 1182 .iterate = ubifs_readdir,
1208 .fsync = ubifs_fsync, 1183 .fsync = ubifs_fsync,
1209 .unlocked_ioctl = ubifs_ioctl, 1184 .unlocked_ioctl = ubifs_ioctl,
1210#ifdef CONFIG_COMPAT 1185#ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
1277 return err; 1277 return err;
1278} 1278}
1279 1279
1280static void ubifs_invalidatepage(struct page *page, unsigned long offset) 1280static void ubifs_invalidatepage(struct page *page, unsigned int offset,
1281 unsigned int length)
1281{ 1282{
1282 struct inode *inode = page->mapping->host; 1283 struct inode *inode = page->mapping->host;
1283 struct ubifs_info *c = inode->i_sb->s_fs_info; 1284 struct ubifs_info *c = inode->i_sb->s_fs_info;
1284 1285
1285 ubifs_assert(PagePrivate(page)); 1286 ubifs_assert(PagePrivate(page));
1286 if (offset) 1287 if (offset || length < PAGE_CACHE_SIZE)
1287 /* Partial page remains dirty */ 1288 /* Partial page remains dirty */
1288 return; 1289 return;
1289 1290
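
This hunk is part of another interface change in the merge: ->invalidatepage gains a length argument so callers can invalidate a sub-range of a page (e.g. for hole punching that does not span whole pages). "Invalidate the whole page" is therefore now offset == 0 && length == PAGE_CACHE_SIZE rather than just offset == 0. A sketch of the new shape (the example_* names are ours):

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		/* whole page gone: safe to drop per-page private state */
		example_release_private(page);	/* hypothetical teardown */
		return;
	}
	/* partial invalidate: the rest of the page stays dirty */
}
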
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
1412 1412
1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", 1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name, 1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name,
1415 c->ro_mount ? ", R/O mode" : NULL); 1415 c->ro_mount ? ", R/O mode" : "");
1416 x = (long long)c->main_lebs * c->leb_size; 1416 x = (long long)c->main_lebs * c->leb_size;
1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", 1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5e17c3..a012c51caffd 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
35#include "udf_i.h" 35#include "udf_i.h"
36#include "udf_sb.h" 36#include "udf_sb.h"
37 37
38static int do_udf_readdir(struct inode *dir, struct file *filp, 38
39 filldir_t filldir, void *dirent) 39static int udf_readdir(struct file *file, struct dir_context *ctx)
40{ 40{
41 struct inode *dir = file_inode(file);
42 struct udf_inode_info *iinfo = UDF_I(dir);
41 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; 43 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
42 struct fileIdentDesc *fi = NULL; 44 struct fileIdentDesc *fi = NULL;
43 struct fileIdentDesc cfi; 45 struct fileIdentDesc cfi;
44 int block, iblock; 46 int block, iblock;
45 loff_t nf_pos = (filp->f_pos - 1) << 2; 47 loff_t nf_pos;
46 int flen; 48 int flen;
47 unsigned char *fname = NULL; 49 unsigned char *fname = NULL;
48 unsigned char *nameptr; 50 unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
54 uint32_t elen; 56 uint32_t elen;
55 sector_t offset; 57 sector_t offset;
56 int i, num, ret = 0; 58 int i, num, ret = 0;
57 unsigned int dt_type;
58 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
59 struct udf_inode_info *iinfo;
60 60
61 if (ctx->pos == 0) {
62 if (!dir_emit_dot(file, ctx))
63 return 0;
64 ctx->pos = 1;
65 }
66 nf_pos = (ctx->pos - 1) << 2;
61 if (nf_pos >= size) 67 if (nf_pos >= size)
62 goto out; 68 goto out;
63 69
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
71 nf_pos = udf_ext0_offset(dir); 77 nf_pos = udf_ext0_offset(dir);
72 78
73 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 79 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
74 iinfo = UDF_I(dir);
75 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 80 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
76 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 81 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
77 &epos, &eloc, &elen, &offset) 82 &epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
116 } 121 }
117 122
118 while (nf_pos < size) { 123 while (nf_pos < size) {
119 filp->f_pos = (nf_pos >> 2) + 1; 124 struct kernel_lb_addr tloc;
125
126 ctx->pos = (nf_pos >> 2) + 1;
120 127
121 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, 128 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
122 &elen, &offset); 129 &elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
155 } 162 }
156 163
157 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { 164 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
158 iblock = parent_ino(filp->f_path.dentry); 165 if (!dir_emit_dotdot(file, ctx))
159 flen = 2; 166 goto out;
160 memcpy(fname, "..", flen); 167 continue;
161 dt_type = DT_DIR;
162 } else {
163 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
164
165 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
166 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
167 dt_type = DT_UNKNOWN;
168 } 168 }
169 169
170 if (flen && filldir(dirent, fname, flen, filp->f_pos, 170 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
171 iblock, dt_type) < 0) 171 if (!flen)
172 continue;
173
174 tloc = lelb_to_cpu(cfi.icb.extLocation);
175 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
176 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
172 goto out; 177 goto out;
173 } /* end while */ 178 } /* end while */
174 179
175 filp->f_pos = (nf_pos >> 2) + 1; 180 ctx->pos = (nf_pos >> 2) + 1;
176 181
177out: 182out:
178 if (fibh.sbh != fibh.ebh) 183 if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
184 return ret; 189 return ret;
185} 190}
186 191
187static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
188{
189 struct inode *dir = file_inode(filp);
190 int result;
191
192 if (filp->f_pos == 0) {
193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
194 return 0;
195 }
196 filp->f_pos++;
197 }
198
199 result = do_udf_readdir(dir, filp, filldir, dirent);
200 return result;
201}
202
203/* readdir and lookup functions */ 192/* readdir and lookup functions */
204const struct file_operations udf_dir_operations = { 193const struct file_operations udf_dir_operations = {
205 .llseek = generic_file_llseek, 194 .llseek = generic_file_llseek,
206 .read = generic_read_dir, 195 .read = generic_read_dir,
207 .readdir = udf_readdir, 196 .iterate = udf_readdir,
208 .unlocked_ioctl = udf_ioctl, 197 .unlocked_ioctl = udf_ioctl,
209 .fsync = generic_file_fsync, 198 .fsync = generic_file_fsync,
210}; 199};
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 102c072c6bbf..5f6fc17d6bc5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
594 return 0; 594 return 0;
595} 595}
596 596
597static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
598{
599 struct inode *inode;
600 struct udf_inode_info *iinfo;
601 int err;
602
603 inode = udf_new_inode(dir, mode, &err);
604 if (!inode)
605 return err;
606
607 iinfo = UDF_I(inode);
608 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops;
610 else
611 inode->i_data.a_ops = &udf_aops;
612 inode->i_op = &udf_file_inode_operations;
613 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode);
615
616 d_tmpfile(dentry, inode);
617 return 0;
618}
619
597static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 620static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
598 dev_t rdev) 621 dev_t rdev)
599{ 622{
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
1311 .rmdir = udf_rmdir, 1334 .rmdir = udf_rmdir,
1312 .mknod = udf_mknod, 1335 .mknod = udf_mknod,
1313 .rename = udf_rename, 1336 .rename = udf_rename,
1337 .tmpfile = udf_tmpfile,
1314}; 1338};
1315const struct inode_operations udf_symlink_inode_operations = { 1339const struct inode_operations udf_symlink_inode_operations = {
1316 .readlink = generic_readlink, 1340 .readlink = generic_readlink,
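
udf_tmpfile wires UDF into the new ->tmpfile inode operation added in this release: the inode is created and marked dirty but never linked into the directory, and d_tmpfile() associates it with an unhashed dentry. From userspace this is O_TMPFILE; a hedged sketch ("/mnt/udf" is a placeholder mount point):

#define _GNU_SOURCE	/* for O_TMPFILE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* an anonymous file: blocks allocated on the fs, but no name */
	int fd = open("/mnt/udf", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {	/* kernel or filesystem lacks O_TMPFILE support */
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "scratch", 7) != 7)
		perror("write");
	close(fd);	/* no link was ever created, so the inode is freed */
	return 0;
}
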
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca09c506..0ecc2cebed8f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
430 * This is blatantly stolen from ext2fs 430 * This is blatantly stolen from ext2fs
431 */ 431 */
432static int 432static int
433ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) 433ufs_readdir(struct file *file, struct dir_context *ctx)
434{ 434{
435 loff_t pos = filp->f_pos; 435 loff_t pos = ctx->pos;
436 struct inode *inode = file_inode(filp); 436 struct inode *inode = file_inode(file);
437 struct super_block *sb = inode->i_sb; 437 struct super_block *sb = inode->i_sb;
438 unsigned int offset = pos & ~PAGE_CACHE_MASK; 438 unsigned int offset = pos & ~PAGE_CACHE_MASK;
439 unsigned long n = pos >> PAGE_CACHE_SHIFT; 439 unsigned long n = pos >> PAGE_CACHE_SHIFT;
440 unsigned long npages = ufs_dir_pages(inode); 440 unsigned long npages = ufs_dir_pages(inode);
441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
442 int need_revalidate = filp->f_version != inode->i_version; 442 int need_revalidate = file->f_version != inode->i_version;
443 unsigned flags = UFS_SB(sb)->s_flags; 443 unsigned flags = UFS_SB(sb)->s_flags;
444 444
445 UFSD("BEGIN\n"); 445 UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
457 ufs_error(sb, __func__, 457 ufs_error(sb, __func__,
458 "bad page in #%lu", 458 "bad page in #%lu",
459 inode->i_ino); 459 inode->i_ino);
460 filp->f_pos += PAGE_CACHE_SIZE - offset; 460 ctx->pos += PAGE_CACHE_SIZE - offset;
461 return -EIO; 461 return -EIO;
462 } 462 }
463 kaddr = page_address(page); 463 kaddr = page_address(page);
464 if (unlikely(need_revalidate)) { 464 if (unlikely(need_revalidate)) {
465 if (offset) { 465 if (offset) {
466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); 466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
467 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; 467 ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
468 } 468 }
469 filp->f_version = inode->i_version; 469 file->f_version = inode->i_version;
470 need_revalidate = 0; 470 need_revalidate = 0;
471 } 471 }
472 de = (struct ufs_dir_entry *)(kaddr+offset); 472 de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 return -EIO; 479 return -EIO;
480 } 480 }
481 if (de->d_ino) { 481 if (de->d_ino) {
482 int over;
483 unsigned char d_type = DT_UNKNOWN; 482 unsigned char d_type = DT_UNKNOWN;
484 483
485 offset = (char *)de - kaddr;
486
487 UFSD("filldir(%s,%u)\n", de->d_name, 484 UFSD("filldir(%s,%u)\n", de->d_name,
488 fs32_to_cpu(sb, de->d_ino)); 485 fs32_to_cpu(sb, de->d_ino));
489 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); 486 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
491 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) 488 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
492 d_type = de->d_u.d_44.d_type; 489 d_type = de->d_u.d_44.d_type;
493 490
494 over = filldir(dirent, de->d_name, 491 if (!dir_emit(ctx, de->d_name,
495 ufs_get_de_namlen(sb, de), 492 ufs_get_de_namlen(sb, de),
496 (n<<PAGE_CACHE_SHIFT) | offset, 493 fs32_to_cpu(sb, de->d_ino),
497 fs32_to_cpu(sb, de->d_ino), d_type); 494 d_type)) {
498 if (over) {
499 ufs_put_page(page); 495 ufs_put_page(page);
500 return 0; 496 return 0;
501 } 497 }
502 } 498 }
503 filp->f_pos += fs16_to_cpu(sb, de->d_reclen); 499 ctx->pos += fs16_to_cpu(sb, de->d_reclen);
504 } 500 }
505 ufs_put_page(page); 501 ufs_put_page(page);
506 } 502 }
@@ -660,7 +656,7 @@ not_empty:
660 656
661const struct file_operations ufs_dir_operations = { 657const struct file_operations ufs_dir_operations = {
662 .read = generic_read_dir, 658 .read = generic_read_dir,
663 .readdir = ufs_readdir, 659 .iterate = ufs_readdir,
664 .fsync = generic_file_fsync, 660 .fsync = generic_file_fsync,
665 .llseek = generic_file_llseek, 661 .llseek = generic_file_llseek,
666}; 662};
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
71 xfs_dir2_sf.o \ 71 xfs_dir2_sf.o \
72 xfs_ialloc.o \ 72 xfs_ialloc.o \
73 xfs_ialloc_btree.o \ 73 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \
74 xfs_inode.o \ 75 xfs_inode.o \
75 xfs_log_recover.o \ 76 xfs_log_recover.o \
76 xfs_mount.o \ 77 xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
175 xfs_agblock_t wantbno, /* target starting block */ 175 xfs_agblock_t wantbno, /* target starting block */
176 xfs_extlen_t wantlen, /* target length */ 176 xfs_extlen_t wantlen, /* target length */
177 xfs_extlen_t alignment, /* target alignment */ 177 xfs_extlen_t alignment, /* target alignment */
178 char userdata, /* are we allocating data? */
178 xfs_agblock_t freebno, /* freespace's starting block */ 179 xfs_agblock_t freebno, /* freespace's starting block */
179 xfs_extlen_t freelen, /* freespace's length */ 180 xfs_extlen_t freelen, /* freespace's length */
180 xfs_agblock_t *newbnop) /* result: best start block from free */ 181 xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
189 ASSERT(freelen >= wantlen); 190 ASSERT(freelen >= wantlen);
190 freeend = freebno + freelen; 191 freeend = freebno + freelen;
191 wantend = wantbno + wantlen; 192 wantend = wantbno + wantlen;
192 if (freebno >= wantbno) { 193 /*
194 * We want to allocate from the start of a free extent if it is past
195 * the desired block or if we are allocating user data and the free
196 * extent is before desired block. The second case is there to allow
197 * for contiguous allocation from the remaining free space if the file
198 * grows in the short term.
199 */
200 if (freebno >= wantbno || (userdata && freeend < wantend)) {
193 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 201 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
194 newbno1 = NULLAGBLOCK; 202 newbno1 = NULLAGBLOCK;
195 } else if (freeend >= wantend && alignment > 1) { 203 } else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
805 xfs_alloc_fix_len(args); 813 xfs_alloc_fix_len(args);
806 814
807 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 815 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
808 args->alignment, *sbnoa, 816 args->alignment,
817 args->userdata, *sbnoa,
809 *slena, &new); 818 *slena, &new);
810 819
811 /* 820 /*
@@ -976,7 +985,8 @@ restart:
976 if (args->len < blen) 985 if (args->len < blen)
977 continue; 986 continue;
978 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 987 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
979 args->alignment, ltbnoa, ltlena, &ltnew); 988 args->alignment, args->userdata, ltbnoa,
989 ltlena, &ltnew);
980 if (ltnew != NULLAGBLOCK && 990 if (ltnew != NULLAGBLOCK &&
981 (args->len > blen || ltdiff < bdiff)) { 991 (args->len > blen || ltdiff < bdiff)) {
982 bdiff = ltdiff; 992 bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
1128 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1138 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1129 xfs_alloc_fix_len(args); 1139 xfs_alloc_fix_len(args);
1130 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1140 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1131 args->alignment, ltbnoa, ltlena, &ltnew); 1141 args->alignment, args->userdata, ltbnoa,
1142 ltlena, &ltnew);
1132 1143
1133 error = xfs_alloc_find_best_extent(args, 1144 error = xfs_alloc_find_best_extent(args,
1134 &bno_cur_lt, &bno_cur_gt, 1145 &bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
1144 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1155 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1145 xfs_alloc_fix_len(args); 1156 xfs_alloc_fix_len(args);
1146 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1157 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1147 args->alignment, gtbnoa, gtlena, &gtnew); 1158 args->alignment, args->userdata, gtbnoa,
1159 gtlena, &gtnew);
1148 1160
1149 error = xfs_alloc_find_best_extent(args, 1161 error = xfs_alloc_find_best_extent(args,
1150 &bno_cur_gt, &bno_cur_lt, 1162 &bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
1203 } 1215 }
1204 rlen = args->len; 1216 rlen = args->len;
1205 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1217 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1206 ltbnoa, ltlena, &ltnew); 1218 args->userdata, ltbnoa, ltlena, &ltnew);
1207 ASSERT(ltnew >= ltbno); 1219 ASSERT(ltnew >= ltbno);
1208 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1220 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1209 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1221 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
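
The new userdata case in xfs_alloc_compute_diff() is easiest to see with numbers (ours, for illustration): take wantbno = 100, wantlen = 10 (so wantend = 110) and a free extent at freebno = 40, freelen = 50 (freeend = 90). Previously, since freebno < wantbno, the allocation was carved from the tail of the free extent so that it ended at block 90, as close to the target as possible, leaving the remaining free space before the new blocks. With userdata set and freeend (90) < wantend (110), the allocation now starts at block 40 instead, leaving blocks 50-89 free immediately after it, so short-term growth of the file can still be allocated contiguously.
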
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a695048be7..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -843,10 +843,12 @@ xfs_cluster_write(
843STATIC void 843STATIC void
844xfs_vm_invalidatepage( 844xfs_vm_invalidatepage(
845 struct page *page, 845 struct page *page,
846 unsigned long offset) 846 unsigned int offset,
847 unsigned int length)
847{ 848{
848 trace_xfs_invalidatepage(page->mapping->host, page, offset); 849 trace_xfs_invalidatepage(page->mapping->host, page, offset,
849 block_invalidatepage(page, offset); 850 length);
851 block_invalidatepage(page, offset, length);
850} 852}
851 853
852/* 854/*
@@ -910,7 +912,7 @@ next_buffer:
910 912
911 xfs_iunlock(ip, XFS_ILOCK_EXCL); 913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
912out_invalidate: 914out_invalidate:
913 xfs_vm_invalidatepage(page, 0); 915 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
914 return; 916 return;
915} 917}
916 918
@@ -940,7 +942,7 @@ xfs_vm_writepage(
940 int count = 0; 942 int count = 0;
941 int nonblocking = 0; 943 int nonblocking = 0;
942 944
943 trace_xfs_writepage(inode, page, 0); 945 trace_xfs_writepage(inode, page, 0, 0);
944 946
945 ASSERT(page_has_buffers(page)); 947 ASSERT(page_has_buffers(page));
946 948
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage(
1171{ 1173{
1172 int delalloc, unwritten; 1174 int delalloc, unwritten;
1173 1175
1174 trace_xfs_releasepage(page->mapping->host, page, 0); 1176 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1175 1177
1176 xfs_count_page_state(page, &delalloc, &unwritten); 1178 xfs_count_page_state(page, &delalloc, &unwritten);
1177 1179
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 31d3cd129269..b800fbcafc7f 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -690,6 +690,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
690 sf = (xfs_attr_shortform_t *)tmpbuffer; 690 sf = (xfs_attr_shortform_t *)tmpbuffer;
691 691
692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
693 xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
694
693 bp = NULL; 695 bp = NULL;
694 error = xfs_da_grow_inode(args, &blkno); 696 error = xfs_da_grow_inode(args, &blkno);
695 if (error) { 697 if (error) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 89042848f9ec..05c698ccb238 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1161,6 +1161,24 @@ xfs_bmap_extents_to_btree(
1161 * since the file data needs to get logged so things will stay consistent. 1161 * since the file data needs to get logged so things will stay consistent.
1162 * (The bmap-level manipulations are ok, though). 1162 * (The bmap-level manipulations are ok, though).
1163 */ 1163 */
1164void
1165xfs_bmap_local_to_extents_empty(
1166 struct xfs_inode *ip,
1167 int whichfork)
1168{
1169 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
1170
1171 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1172 ASSERT(ifp->if_bytes == 0);
1173 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
1174
1175 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
1176 ifp->if_flags &= ~XFS_IFINLINE;
1177 ifp->if_flags |= XFS_IFEXTENTS;
1178 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
1179}
1180
1181
1164STATIC int /* error */ 1182STATIC int /* error */
1165xfs_bmap_local_to_extents( 1183xfs_bmap_local_to_extents(
1166 xfs_trans_t *tp, /* transaction pointer */ 1184 xfs_trans_t *tp, /* transaction pointer */
@@ -1174,9 +1192,12 @@ xfs_bmap_local_to_extents(
1174 struct xfs_inode *ip, 1192 struct xfs_inode *ip,
1175 struct xfs_ifork *ifp)) 1193 struct xfs_ifork *ifp))
1176{ 1194{
1177 int error; /* error return value */ 1195 int error = 0;
1178 int flags; /* logging flags returned */ 1196 int flags; /* logging flags returned */
1179 xfs_ifork_t *ifp; /* inode fork pointer */ 1197 xfs_ifork_t *ifp; /* inode fork pointer */
1198 xfs_alloc_arg_t args; /* allocation arguments */
1199 xfs_buf_t *bp; /* buffer for extent block */
1200 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1180 1201
1181 /* 1202 /*
1182 * We don't want to deal with the case of keeping inode data inline yet. 1203 * We don't want to deal with the case of keeping inode data inline yet.
@@ -1185,68 +1206,65 @@ xfs_bmap_local_to_extents(
1185 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); 1206 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
1186 ifp = XFS_IFORK_PTR(ip, whichfork); 1207 ifp = XFS_IFORK_PTR(ip, whichfork);
1187 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 1208 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1209
1210 if (!ifp->if_bytes) {
1211 xfs_bmap_local_to_extents_empty(ip, whichfork);
1212 flags = XFS_ILOG_CORE;
1213 goto done;
1214 }
1215
1188 flags = 0; 1216 flags = 0;
1189 error = 0; 1217 error = 0;
1190 if (ifp->if_bytes) { 1218 ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
1191 xfs_alloc_arg_t args; /* allocation arguments */ 1219 XFS_IFINLINE);
1192 xfs_buf_t *bp; /* buffer for extent block */ 1220 memset(&args, 0, sizeof(args));
1193 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 1221 args.tp = tp;
1194 1222 args.mp = ip->i_mount;
1195 ASSERT((ifp->if_flags & 1223 args.firstblock = *firstblock;
1196 (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); 1224 /*
1197 memset(&args, 0, sizeof(args)); 1225 * Allocate a block. We know we need only one, since the
1198 args.tp = tp; 1226 * file currently fits in an inode.
1199 args.mp = ip->i_mount; 1227 */
1200 args.firstblock = *firstblock; 1228 if (*firstblock == NULLFSBLOCK) {
1201 /* 1229 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1202 * Allocate a block. We know we need only one, since the 1230 args.type = XFS_ALLOCTYPE_START_BNO;
1203 * file currently fits in an inode.
1204 */
1205 if (*firstblock == NULLFSBLOCK) {
1206 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1207 args.type = XFS_ALLOCTYPE_START_BNO;
1208 } else {
1209 args.fsbno = *firstblock;
1210 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1211 }
1212 args.total = total;
1213 args.minlen = args.maxlen = args.prod = 1;
1214 error = xfs_alloc_vextent(&args);
1215 if (error)
1216 goto done;
1217
1218 /* Can't fail, the space was reserved. */
1219 ASSERT(args.fsbno != NULLFSBLOCK);
1220 ASSERT(args.len == 1);
1221 *firstblock = args.fsbno;
1222 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1223
1224 /* initialise the block and copy the data */
1225 init_fn(tp, bp, ip, ifp);
1226
1227 /* account for the change in fork size and log everything */
1228 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1229 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
1230 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1231 xfs_iext_add(ifp, 0, 1);
1232 ep = xfs_iext_get_ext(ifp, 0);
1233 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1234 trace_xfs_bmap_post_update(ip, 0,
1235 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1236 _THIS_IP_);
1237 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1238 ip->i_d.di_nblocks = 1;
1239 xfs_trans_mod_dquot_byino(tp, ip,
1240 XFS_TRANS_DQ_BCOUNT, 1L);
1241 flags |= xfs_ilog_fext(whichfork);
1242 } else { 1231 } else {
1243 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); 1232 args.fsbno = *firstblock;
1244 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); 1233 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1245 } 1234 }
1246 ifp->if_flags &= ~XFS_IFINLINE; 1235 args.total = total;
1247 ifp->if_flags |= XFS_IFEXTENTS; 1236 args.minlen = args.maxlen = args.prod = 1;
1248 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 1237 error = xfs_alloc_vextent(&args);
1238 if (error)
1239 goto done;
1240
1241 /* Can't fail, the space was reserved. */
1242 ASSERT(args.fsbno != NULLFSBLOCK);
1243 ASSERT(args.len == 1);
1244 *firstblock = args.fsbno;
1245 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1246
1247 /* initialise the block and copy the data */
1248 init_fn(tp, bp, ip, ifp);
1249
1250 /* account for the change in fork size and log everything */
1251 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1252 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1253 xfs_bmap_local_to_extents_empty(ip, whichfork);
1249 flags |= XFS_ILOG_CORE; 1254 flags |= XFS_ILOG_CORE;
1255
1256 xfs_iext_add(ifp, 0, 1);
1257 ep = xfs_iext_get_ext(ifp, 0);
1258 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1259 trace_xfs_bmap_post_update(ip, 0,
1260 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1261 _THIS_IP_);
1262 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1263 ip->i_d.di_nblocks = 1;
1264 xfs_trans_mod_dquot_byino(tp, ip,
1265 XFS_TRANS_DQ_BCOUNT, 1L);
1266 flags |= xfs_ilog_fext(whichfork);
1267
1250done: 1268done:
1251 *logflagsp = flags; 1269 *logflagsp = flags;
1252 return error; 1270 return error;
@@ -1323,25 +1341,6 @@ xfs_bmap_add_attrfork_extents(
1323} 1341}
1324 1342
1325/* 1343/*
1326 * Block initialisation function for local to extent format conversion.
1327 *
1328 * This shouldn't actually be called by anyone, so make sure debug kernels cause
1329 * a noticable failure.
1330 */
1331STATIC void
1332xfs_bmap_local_to_extents_init_fn(
1333 struct xfs_trans *tp,
1334 struct xfs_buf *bp,
1335 struct xfs_inode *ip,
1336 struct xfs_ifork *ifp)
1337{
1338 ASSERT(0);
1339 bp->b_ops = &xfs_bmbt_buf_ops;
1340 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
1341 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
1342}
1343
1344/*
1345 * Called from xfs_bmap_add_attrfork to handle local format files. Each 1344 * Called from xfs_bmap_add_attrfork to handle local format files. Each
1346 * different data fork content type needs a different callout to do the 1345 * different data fork content type needs a different callout to do the
1347 * conversion. Some are basic and only require special block initialisation 1346 * conversion. Some are basic and only require special block initialisation
@@ -1381,9 +1380,9 @@ xfs_bmap_add_attrfork_local(
1381 flags, XFS_DATA_FORK, 1380 flags, XFS_DATA_FORK,
1382 xfs_symlink_local_to_remote); 1381 xfs_symlink_local_to_remote);
1383 1382
1384 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, 1383 /* should only be called for types that support local format data */
1385 XFS_DATA_FORK, 1384 ASSERT(0);
1386 xfs_bmap_local_to_extents_init_fn); 1385 return EFSCORRUPTED;
1387} 1386}
1388 1387
1389/* 1388/*
@@ -4907,20 +4906,19 @@ xfs_bmapi_write(
4907 orig_mval = mval; 4906 orig_mval = mval;
4908 orig_nmap = *nmap; 4907 orig_nmap = *nmap;
4909#endif 4908#endif
4909 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4910 XFS_ATTR_FORK : XFS_DATA_FORK;
4910 4911
4911 ASSERT(*nmap >= 1); 4912 ASSERT(*nmap >= 1);
4912 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4913 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4913 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4914 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4914 ASSERT(tp != NULL); 4915 ASSERT(tp != NULL);
4915 ASSERT(len > 0); 4916 ASSERT(len > 0);
4916 4917 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4917 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4918 XFS_ATTR_FORK : XFS_DATA_FORK;
4919 4918
4920 if (unlikely(XFS_TEST_ERROR( 4919 if (unlikely(XFS_TEST_ERROR(
4921 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4920 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4922 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 4921 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4923 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4924 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4922 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4925 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); 4923 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4926 return XFS_ERROR(EFSCORRUPTED); 4924 return XFS_ERROR(EFSCORRUPTED);
@@ -4933,37 +4931,6 @@ xfs_bmapi_write(
4933 4931
4934 XFS_STATS_INC(xs_blk_mapw); 4932 XFS_STATS_INC(xs_blk_mapw);
4935 4933
4936 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4937 /*
4938 * XXX (dgc): This assumes we are only called for inodes that
4939 * contain content neutral data in local format. Anything that
4940 * contains caller-specific data in local format that needs
4941 * transformation to move to a block format needs to do the
4942 * conversion to extent format itself.
4943 *
4944 * Directory data forks and attribute forks handle this
4945 * themselves, but with the addition of metadata verifiers every
4946 * data fork in local format now contains caller specific data
4947 * and as such conversion through this function is likely to be
4948 * broken.
4949 *
4950 * The only likely user of this branch is for remote symlinks,
4951 * but we cannot overwrite the data fork contents of the symlink
4952 * (EEXIST occurs higher up the stack) and so it will never go
4953 * from local format to extent format here. Hence I don't think
4954 * this branch is ever executed intentionally and we should
4955 * consider removing it and asserting that xfs_bmapi_write()
4956 * cannot be called directly on local format forks. i.e. callers
4957 * are completely responsible for local to extent format
4958 * conversion, not xfs_bmapi_write().
4959 */
4960 error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
4961 &bma.logflags, whichfork,
4962 xfs_bmap_local_to_extents_init_fn);
4963 if (error)
4964 goto error0;
4965 }
4966
4967 if (*firstblock == NULLFSBLOCK) { 4934 if (*firstblock == NULLFSBLOCK) {
4968 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) 4935 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4969 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; 4936 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
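
Factoring out xfs_bmap_local_to_extents_empty() gives callers that have already emptied a local-format fork a way to flip it to extents format without touching the allocation path; the xfs_attr_shortform_to_leaf() hunk earlier in this merge uses exactly that pairing (lifted from the hunks above):

	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);	    /* drop the inline data */
	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); /* fork: 0-extent extents format */

And xfs_bmapi_write() now asserts it is never handed a local-format fork at all: local-to-extents conversion is entirely the caller's job.
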
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516eb..1cf1292d29b7 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -172,6 +172,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
172#endif 172#endif
173 173
174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
175void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
175void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 176void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
176 struct xfs_bmap_free *flist, struct xfs_mount *mp); 177 struct xfs_bmap_free *flist, struct xfs_mount *mp);
177void xfs_bmap_cancel(struct xfs_bmap_free *flist); 178void xfs_bmap_cancel(struct xfs_bmap_free *flist);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
196#define XFS_BMDR_SPACE_CALC(nrecs) \ 196#define XFS_BMDR_SPACE_CALC(nrecs) \
197 (int)(sizeof(xfs_bmdr_block_t) + \ 197 (int)(sizeof(xfs_bmdr_block_t) + \
198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
199#define XFS_BMAP_BMDR_SPACE(bb) \
200 (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
199 201
200/* 202/*
201 * Maximum number of bmap btree levels. 203 * Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
140 140
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
142 142
143 if (bip->bli_flags & XFS_BLI_ORDERED) {
144 /*
145 * The buffer has been logged just to order it.
146 * It is not being included in the transaction
147 * commit, so no vectors are used at all.
148 */
149 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED;
151 }
152
143 /* 153 /*
144 * the vector count is based on the number of buffer vectors we have 154 * the vector count is based on the number of buffer vectors we have
145 * dirty bits in. This will only be greater than one when we have a 155 * dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
212 goto out; 222 goto out;
213 } 223 }
214 224
225
215 /* 226 /*
216 * Fill in an iovec for each set of contiguous chunks. 227 * Fill in an iovec for each set of contiguous chunks.
217 */ 228 */
@@ -299,18 +310,36 @@ xfs_buf_item_format(
299 310
300 /* 311 /*
301 * If it is an inode buffer, transfer the in-memory state to the 312 * If it is an inode buffer, transfer the in-memory state to the
302 * format flags and clear the in-memory state. We do not transfer 313 * format flags and clear the in-memory state.
314 *
315 * For buffer based inode allocation, we do not transfer
303 * this state if the inode buffer allocation has not yet been committed 316 * this state if the inode buffer allocation has not yet been committed
304 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent 317 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
305 * correct replay of the inode allocation. 318 * correct replay of the inode allocation.
319 *
320 * For icreate item based inode allocation, the buffers aren't written
321 * to the journal during allocation, and hence we should always tag the
322 * buffer as an inode buffer so that the correct unlinked list replay
323 * occurs during recovery.
306 */ 324 */
307 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 325 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
308 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 326 if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
327 !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
309 xfs_log_item_in_current_chkpt(lip))) 328 xfs_log_item_in_current_chkpt(lip)))
310 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; 329 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
311 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 330 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
312 } 331 }
313 332
333 if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
334 XFS_BLI_ORDERED) {
335 /*
336 * The buffer has been logged just to order it. It is not being
337 * included in the transaction commit, so don't format it.
338 */
339 trace_xfs_buf_item_format_ordered(bip);
340 return;
341 }
342
314 for (i = 0; i < bip->bli_format_count; i++) { 343 for (i = 0; i < bip->bli_format_count; i++) {
315 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 344 vecp = xfs_buf_item_format_segment(bip, vecp, offset,
316 &bip->bli_formats[i]); 345 &bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
340 369
341 ASSERT(atomic_read(&bip->bli_refcount) > 0); 370 ASSERT(atomic_read(&bip->bli_refcount) > 0);
342 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 371 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
372 (bip->bli_flags & XFS_BLI_ORDERED) ||
343 (bip->bli_flags & XFS_BLI_STALE)); 373 (bip->bli_flags & XFS_BLI_STALE));
344 374
345 trace_xfs_buf_item_pin(bip); 375 trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
512{ 542{
513 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 543 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
514 struct xfs_buf *bp = bip->bli_buf; 544 struct xfs_buf *bp = bip->bli_buf;
515 int aborted, clean, i; 545 bool clean;
516 uint hold; 546 bool aborted;
547 int flags;
517 548
518 /* Clear the buffer's association with this transaction. */ 549 /* Clear the buffer's association with this transaction. */
519 bp->b_transp = NULL; 550 bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
524 * (cancelled) buffers at unpin time, but we'll never go through the 555 * (cancelled) buffers at unpin time, but we'll never go through the
525 * pin/unpin cycle if we abort inside commit. 556 * pin/unpin cycle if we abort inside commit.
526 */ 557 */
527 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; 558 aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
528
529 /* 559 /*
530 * Before possibly freeing the buf item, determine if we should 560 * Before possibly freeing the buf item, copy the per-transaction state
531 * release the buffer at the end of this routine. 561 * so we can reference it safely later after clearing it from the
562 * buffer log item.
532 */ 563 */
533 hold = bip->bli_flags & XFS_BLI_HOLD; 564 flags = bip->bli_flags;
534 565 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
535 /* Clear the per transaction state. */
536 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
537 566
538 /* 567 /*
539 * If the buf item is marked stale, then don't do anything. We'll 568 * If the buf item is marked stale, then don't do anything. We'll
540 * unlock the buffer and free the buf item when the buffer is unpinned 569 * unlock the buffer and free the buf item when the buffer is unpinned
541 * for the last time. 570 * for the last time.
542 */ 571 */
543 if (bip->bli_flags & XFS_BLI_STALE) { 572 if (flags & XFS_BLI_STALE) {
544 trace_xfs_buf_item_unlock_stale(bip); 573 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 574 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) { 575 if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
557 * be the only reference to the buf item, so we free it anyway 586 * be the only reference to the buf item, so we free it anyway
558 * regardless of whether it is dirty or not. A dirty abort implies a 587 * regardless of whether it is dirty or not. A dirty abort implies a
559 * shutdown, anyway. 588 * shutdown, anyway.
589 *
590 * Ordered buffers are dirty but may have no recorded changes, so ensure
591 * we only release clean items here.
560 */ 592 */
561 clean = 1; 593 clean = (flags & XFS_BLI_DIRTY) ? false : true;
562 for (i = 0; i < bip->bli_format_count; i++) { 594 if (clean) {
563 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, 595 int i;
564 bip->bli_formats[i].blf_map_size)) { 596 for (i = 0; i < bip->bli_format_count; i++) {
565 clean = 0; 597 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
566 break; 598 bip->bli_formats[i].blf_map_size)) {
599 clean = false;
600 break;
601 }
567 } 602 }
568 } 603 }
569 if (clean) 604 if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
576 } else 611 } else
577 atomic_dec(&bip->bli_refcount); 612 atomic_dec(&bip->bli_refcount);
578 613
579 if (!hold) 614 if (!(flags & XFS_BLI_HOLD))
580 xfs_buf_relse(bp); 615 xfs_buf_relse(bp);
581} 616}
582 617
@@ -842,12 +877,6 @@ xfs_buf_item_log(
842 struct xfs_buf *bp = bip->bli_buf; 877 struct xfs_buf *bp = bip->bli_buf;
843 878
844 /* 879 /*
845 * Mark the item as having some dirty data for
846 * quick reference in xfs_buf_item_dirty.
847 */
848 bip->bli_flags |= XFS_BLI_DIRTY;
849
850 /*
851 * walk each buffer segment and mark them dirty appropriately. 880 * walk each buffer segment and mark them dirty appropriately.
852 */ 881 */
853 start = 0; 882 start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
873 902
874 903
875/* 904/*
876 * Return 1 if the buffer has some data that has been logged (at any 905 * Return 1 if the buffer has been logged or ordered in a transaction (at any
877 * point, not just the current transaction) and 0 if not. 906 * point, not just the current transaction) and 0 if not.
878 */ 907 */
879uint 908uint
@@ -907,11 +936,11 @@ void
907xfs_buf_item_relse( 936xfs_buf_item_relse(
908 xfs_buf_t *bp) 937 xfs_buf_t *bp)
909{ 938{
910 xfs_buf_log_item_t *bip; 939 xfs_buf_log_item_t *bip = bp->b_fspriv;
911 940
912 trace_xfs_buf_item_relse(bp, _RET_IP_); 941 trace_xfs_buf_item_relse(bp, _RET_IP_);
942 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
913 943
914 bip = bp->b_fspriv;
915 bp->b_fspriv = bip->bli_item.li_bio_list; 944 bp->b_fspriv = bip->bli_item.li_bio_list;
916 if (bp->b_fspriv == NULL) 945 if (bp->b_fspriv == NULL)
917 bp->b_iodone = NULL; 946 bp->b_iodone = NULL;
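
An ordered buffer (XFS_BLI_ORDERED) is logged only for ordering: it is pinned and unpinned with the transaction, but xfs_buf_item_size() reports XFS_LOG_VEC_ORDERED and xfs_buf_item_format() emits no vectors, so none of the buffer's contents travel through the log. A hedged consumer sketch (we assume xfs_trans_ordered_buf() here, the helper introduced alongside this flag in the same patch series):

	/*
	 * Join bp to tp and order it against the commit without
	 * dirtying any byte range in the log item.
	 */
	xfs_trans_ordered_buf(tp, bp);	/* sets XFS_BLI_ORDERED */

Because an ordered buffer is dirty in the transaction while its dirty bitmaps stay empty, xfs_buf_item_unlock() above now consults XFS_BLI_DIRTY before trusting the bitmap scan to decide that an item is clean and can be freed.
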
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
120#define XFS_BLI_INODE_ALLOC_BUF 0x10 120#define XFS_BLI_INODE_ALLOC_BUF 0x10
121#define XFS_BLI_STALE_INODE 0x20 121#define XFS_BLI_STALE_INODE 0x20
122#define XFS_BLI_INODE_BUF 0x40 122#define XFS_BLI_INODE_BUF 0x40
123#define XFS_BLI_ORDERED 0x80
123 124
124#define XFS_BLI_FLAGS \ 125#define XFS_BLI_FLAGS \
125 { XFS_BLI_HOLD, "HOLD" }, \ 126 { XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
128 { XFS_BLI_LOGGED, "LOGGED" }, \ 129 { XFS_BLI_LOGGED, "LOGGED" }, \
129 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 130 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
130 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ 131 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
131 { XFS_BLI_INODE_BUF, "INODE_BUF" } 132 { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
133 { XFS_BLI_ORDERED, "ORDERED" }
132 134
133 135
134#ifdef __KERNEL__ 136#ifdef __KERNEL__
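The { bit, name } pairs in XFS_BLI_FLAGS feed trace-event pretty-printing, and the new ORDERED entry keeps that table in step with the flag added above. A user-space sketch of the same table-driven decode, assuming the flag values defined in this header:

#include <stdio.h>
#include <stddef.h>

struct flag_name {
        unsigned int bit;
        const char *name;
};

/* Values mirror the XFS_BLI_* definitions in this header. */
static const struct flag_name bli_flags[] = {
        { 0x01, "HOLD" },        { 0x02, "DIRTY" },
        { 0x04, "STALE" },       { 0x08, "LOGGED" },
        { 0x10, "INODE_ALLOC" }, { 0x20, "STALE_INODE" },
        { 0x40, "INODE_BUF" },   { 0x80, "ORDERED" },
};

static void print_bli_flags(unsigned int flags)
{
        size_t i;

        for (i = 0; i < sizeof(bli_flags) / sizeof(bli_flags[0]); i++)
                if (flags & bli_flags[i].bit)
                        printf("%s ", bli_flags[i].name);
        putchar('\n');
}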
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
27#include "xfs_dinode.h" 30#include "xfs_dinode.h"
28#include "xfs_inode.h" 31#include "xfs_inode.h"
29#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
182 */ 185 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 if (XFS_IFORK_BOFF(ip) && 187 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
186 return EINVAL; 189 return EINVAL;
187 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
188 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
192 /* Reciprocal target->temp btree format checks */ 195 /* Reciprocal target->temp btree format checks */
193 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194 if (XFS_IFORK_BOFF(tip) && 197 if (XFS_IFORK_BOFF(tip) &&
195 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
196 return EINVAL; 199 return EINVAL;
197
198 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
199 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
200 return EINVAL; 202 return EINVAL;
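The swap-format check above now sizes the btree root by the space it needs in the on-disk (bmdr) format, computed from its record count via XFS_BMAP_BMDR_SPACE(), instead of comparing the in-core broot byte count. A simplified sketch of that comparison, with assumed per-record sizes standing in for the real bmbt geometry:

/* Illustrative sizes only; the real values come from the bmbt geometry. */
#define BMDR_HDR_BYTES  4       /* assumed on-disk root block header */
#define BMDR_REC_BYTES  16      /* assumed key + pointer bytes per record */

static unsigned int bmdr_space(unsigned int numrecs)
{
        return BMDR_HDR_BYTES + numrecs * BMDR_REC_BYTES;
}

/* The data fork btree root must fit below the attribute fork offset. */
static int root_fits_fork(unsigned int numrecs, unsigned int fork_boff)
{
        return bmdr_space(numrecs) <= fork_boff;
}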
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index f7a0e95d197a..e5869b50dc41 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
39 * There is a very similar struct icdinode in xfs_inode which matches the 39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native 40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian. 41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
42 */ 45 */
43typedef struct xfs_dinode { 46typedef struct xfs_dinode {
44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
@@ -132,9 +135,6 @@ typedef enum xfs_dinode_fmt {
132#define XFS_LITINO(mp, version) \ 135#define XFS_LITINO(mp, version) \
133 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) 136 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
134 137
135#define XFS_BROOT_SIZE_ADJ(ip) \
136 (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
137
138/* 138/*
139 * Inode data & attribute fork sizes, per inode. 139 * Inode data & attribute fork sizes, per inode.
140 */ 140 */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921d..8f023dee404d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
368int 368int
369xfs_readdir( 369xfs_readdir(
370 xfs_inode_t *dp, 370 xfs_inode_t *dp,
371 void *dirent, 371 struct dir_context *ctx,
372 size_t bufsize, 372 size_t bufsize)
373 xfs_off_t *offset,
374 filldir_t filldir)
375{ 373{
376 int rval; /* return value */ 374 int rval; /* return value */
377 int v; /* type-checking value */ 375 int v; /* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
385 XFS_STATS_INC(xs_dir_getdents); 383 XFS_STATS_INC(xs_dir_getdents);
386 384
387 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 385 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
388 rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); 386 rval = xfs_dir2_sf_getdents(dp, ctx);
389 else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) 387 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
390 ; 388 ;
391 else if (v) 389 else if (v)
392 rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); 390 rval = xfs_dir2_block_getdents(dp, ctx);
393 else 391 else
394 rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, 392 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
395 filldir);
396 return rval; 393 return rval;
397} 394}
398 395
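With the readdir rework visible here, the position and the fill callback travel together in struct dir_context: ctx->pos replaces the xfs_off_t pointer and dir_emit() replaces the filldir_t call. A minimal sketch of an iterator in the new style, emitting one hypothetical entry (not XFS code):

#include <linux/fs.h>

static int example_iterate(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos > 0)
                return 0;               /* entry already returned */

        /* ctx->pos names the entry about to be emitted */
        if (!dir_emit(ctx, "example", 7, 42 /* ino */, DT_REG))
                return 0;               /* buffer full; resume here later */

        ctx->pos = 1;                   /* advance past the entry */
        return 0;
}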
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e59f5fc816fe..5e7fbd72cf52 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
32#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
33#include "xfs_dir2.h" 34#include "xfs_dir2.h"
34#include "xfs_dir2_format.h" 35#include "xfs_dir2_format.h"
@@ -569,9 +570,7 @@ xfs_dir2_block_addname(
569int /* error */ 570int /* error */
570xfs_dir2_block_getdents( 571xfs_dir2_block_getdents(
571 xfs_inode_t *dp, /* incore inode */ 572 xfs_inode_t *dp, /* incore inode */
572 void *dirent, 573 struct dir_context *ctx)
573 xfs_off_t *offset,
574 filldir_t filldir)
575{ 574{
576 xfs_dir2_data_hdr_t *hdr; /* block header */ 575 xfs_dir2_data_hdr_t *hdr; /* block header */
577 struct xfs_buf *bp; /* buffer for block */ 576 struct xfs_buf *bp; /* buffer for block */
@@ -589,7 +588,7 @@ xfs_dir2_block_getdents(
589 /* 588 /*
590 * If the block number in the offset is out of range, we're done. 589 * If the block number in the offset is out of range, we're done.
591 */ 590 */
592 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 591 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
593 return 0; 592 return 0;
594 593
595 error = xfs_dir3_block_read(NULL, dp, &bp); 594 error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -600,7 +599,7 @@ xfs_dir2_block_getdents(
600 * Extract the byte offset we start at from the seek pointer. 599 * Extract the byte offset we start at from the seek pointer.
601 * We'll skip entries before this. 600 * We'll skip entries before this.
602 */ 601 */
603 wantoff = xfs_dir2_dataptr_to_off(mp, *offset); 602 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
604 hdr = bp->b_addr; 603 hdr = bp->b_addr;
605 xfs_dir3_data_check(dp, bp); 604 xfs_dir3_data_check(dp, bp);
606 /* 605 /*
@@ -639,13 +638,12 @@ xfs_dir2_block_getdents(
639 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 638 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
640 (char *)dep - (char *)hdr); 639 (char *)dep - (char *)hdr);
641 640
641 ctx->pos = cook & 0x7fffffff;
642 /* 642 /*
643 * If it didn't fit, set the final offset to here & return. 643 * If it didn't fit, set the final offset to here & return.
644 */ 644 */
645 if (filldir(dirent, (char *)dep->name, dep->namelen, 645 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
646 cook & 0x7fffffff, be64_to_cpu(dep->inumber), 646 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
647 DT_UNKNOWN)) {
648 *offset = cook & 0x7fffffff;
649 xfs_trans_brelse(NULL, bp); 647 xfs_trans_brelse(NULL, bp);
650 return 0; 648 return 0;
651 } 649 }
@@ -655,7 +653,7 @@ xfs_dir2_block_getdents(
655 * Reached the end of the block. 653 * Reached the end of the block.
656 * Set the offset to a non-existent block 1 and return. 654 * Set the offset to a non-existent block 1 and return.
657 */ 655 */
658 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 656 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
659 0x7fffffff; 657 0x7fffffff;
660 xfs_trans_brelse(NULL, bp); 658 xfs_trans_brelse(NULL, bp);
661 return 0; 659 return 0;
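The cookies stored in ctx->pos pack a directory block number together with an in-block offset, and are masked with 0x7fffffff so the readdir offset stays a positive 31-bit value. A simplified sketch of the packing, assuming a fixed 12-bit in-block offset in place of the mount-dependent geometry:

#include <stdint.h>

#define OFF_BITS        12              /* assumed in-block offset width */
#define COOKIE_MASK     0x7fffffffULL   /* keep readdir offsets positive */

static uint64_t db_off_to_cookie(uint32_t db, uint32_t off)
{
        return (((uint64_t)db << OFF_BITS) | off) & COOKIE_MASK;
}

static uint32_t cookie_to_db(uint64_t cookie)
{
        return (uint32_t)(cookie >> OFF_BITS);
}

static uint32_t cookie_to_off(uint64_t cookie)
{
        return (uint32_t)(cookie & ((1u << OFF_BITS) - 1));
}

Setting ctx->pos to a cookie past the last block (m_dirdatablk + 1 above) is what makes a subsequent call fail the range check and terminate the listing.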
@@ -1167,13 +1165,15 @@ xfs_dir2_sf_to_block(
1167 __be16 *tagp; /* end of data entry */ 1165 __be16 *tagp; /* end of data entry */
1168 xfs_trans_t *tp; /* transaction pointer */ 1166 xfs_trans_t *tp; /* transaction pointer */
1169 struct xfs_name name; 1167 struct xfs_name name;
1168 struct xfs_ifork *ifp;
1170 1169
1171 trace_xfs_dir2_sf_to_block(args); 1170 trace_xfs_dir2_sf_to_block(args);
1172 1171
1173 dp = args->dp; 1172 dp = args->dp;
1174 tp = args->trans; 1173 tp = args->trans;
1175 mp = dp->i_mount; 1174 mp = dp->i_mount;
1176 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 1175 ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
1176 ASSERT(ifp->if_flags & XFS_IFINLINE);
1177 /* 1177 /*
1178 * Bomb out if the shortform directory is way too short. 1178 * Bomb out if the shortform directory is way too short.
1179 */ 1179 */
@@ -1182,22 +1182,23 @@ xfs_dir2_sf_to_block(
1182 return XFS_ERROR(EIO); 1182 return XFS_ERROR(EIO);
1183 } 1183 }
1184 1184
1185 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1185 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
1186 1186
1187 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 1187 ASSERT(ifp->if_bytes == dp->i_d.di_size);
1188 ASSERT(dp->i_df.if_u1.if_data != NULL); 1188 ASSERT(ifp->if_u1.if_data != NULL);
1189 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); 1189 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
1190 ASSERT(dp->i_d.di_nextents == 0);
1190 1191
1191 /* 1192 /*
1192 * Copy the directory into a temporary buffer. 1193 * Copy the directory into a temporary buffer.
1193 * Then pitch the incore inode data so we can make extents. 1194 * Then pitch the incore inode data so we can make extents.
1194 */ 1195 */
1195 sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); 1196 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
1196 memcpy(sfp, oldsfp, dp->i_df.if_bytes); 1197 memcpy(sfp, oldsfp, ifp->if_bytes);
1197 1198
1198 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); 1199 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
1200 xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
1199 dp->i_d.di_size = 0; 1201 dp->i_d.di_size = 0;
1200 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1201 1202
1202 /* 1203 /*
1203 * Add block 0 to the inode. 1204 * Add block 0 to the inode.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index da71a1819d78..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
1108 struct xfs_mount *mp = dp->i_mount; 1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp; 1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map; 1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1111 int error = 0; 1112 int error = 0;
1112 int length; 1113 int length;
1113 int i; 1114 int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
1236 /* 1237 /*
1237 * Do we need more readahead? 1238 * Do we need more readahead?
1238 */ 1239 */
1240 blk_start_plug(&plug);
1239 for (mip->ra_index = mip->ra_offset = i = 0; 1241 for (mip->ra_index = mip->ra_offset = i = 0;
1240 mip->ra_want > mip->ra_current && i < mip->map_blocks; 1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1241 i += mp->m_dirblkfsbs) { 1243 i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
1287 } 1289 }
1288 } 1290 }
1289 } 1291 }
1292 blk_finish_plug(&plug);
1290 1293
1291out: 1294out:
1292 *bpp = bp; 1295 *bpp = bp;
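The plug added here wraps the whole readahead loop so the block layer can merge and batch the queued requests before dispatching them. The pattern in isolation, sketched with a hypothetical submission callback:

#include <linux/blkdev.h>

static void readahead_batch(void (*issue_one)(int arg), int count)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < count; i++)
                issue_one(i);           /* requests accumulate on the plug */
        blk_finish_plug(&plug);         /* flush the whole batch down */
}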
@@ -1300,10 +1303,8 @@ out:
1300int /* error */ 1303int /* error */
1301xfs_dir2_leaf_getdents( 1304xfs_dir2_leaf_getdents(
1302 xfs_inode_t *dp, /* incore directory inode */ 1305 xfs_inode_t *dp, /* incore directory inode */
1303 void *dirent, 1306 struct dir_context *ctx,
1304 size_t bufsize, 1307 size_t bufsize)
1305 xfs_off_t *offset,
1306 filldir_t filldir)
1307{ 1308{
1308 struct xfs_buf *bp = NULL; /* data block buffer */ 1309 struct xfs_buf *bp = NULL; /* data block buffer */
1309 xfs_dir2_data_hdr_t *hdr; /* data block header */ 1310 xfs_dir2_data_hdr_t *hdr; /* data block header */
@@ -1322,7 +1323,7 @@ xfs_dir2_leaf_getdents(
1322 * If the offset is at or past the largest allowed value, 1323 * If the offset is at or past the largest allowed value,
1323 * give up right away. 1324 * give up right away.
1324 */ 1325 */
1325 if (*offset >= XFS_DIR2_MAX_DATAPTR) 1326 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
1326 return 0; 1327 return 0;
1327 1328
1328 mp = dp->i_mount; 1329 mp = dp->i_mount;
@@ -1343,7 +1344,7 @@ xfs_dir2_leaf_getdents(
1343 * Inside the loop we keep the main offset value as a byte offset 1344 * Inside the loop we keep the main offset value as a byte offset
1344 * in the directory file. 1345 * in the directory file.
1345 */ 1346 */
1346 curoff = xfs_dir2_dataptr_to_byte(mp, *offset); 1347 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
1347 1348
1348 /* 1349 /*
1349 * Force this conversion through db so we truncate the offset 1350 * Force this conversion through db so we truncate the offset
@@ -1444,8 +1445,8 @@ xfs_dir2_leaf_getdents(
1444 dep = (xfs_dir2_data_entry_t *)ptr; 1445 dep = (xfs_dir2_data_entry_t *)ptr;
1445 length = xfs_dir2_data_entsize(dep->namelen); 1446 length = xfs_dir2_data_entsize(dep->namelen);
1446 1447
1447 if (filldir(dirent, (char *)dep->name, dep->namelen, 1448 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1448 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1449 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
1449 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1450 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1450 break; 1451 break;
1451 1452
@@ -1462,9 +1463,9 @@ xfs_dir2_leaf_getdents(
1462 * All done. Set output offset value to current offset. 1463 * All done. Set output offset value to current offset.
1463 */ 1464 */
1464 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) 1465 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1465 *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; 1466 ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
1466 else 1467 else
1467 *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; 1468 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1468 kmem_free(map_info); 1469 kmem_free(map_info);
1469 if (bp) 1470 if (bp)
1470 xfs_trans_brelse(NULL, bp); 1471 xfs_trans_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7cf573c88aad..0511cda4a712 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; 33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
34 34
35extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
37 xfs_off_t *offset, filldir_t filldir); 37 struct dir_context *ctx);
38extern int xfs_dir2_block_lookup(struct xfs_da_args *args); 38extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
39extern int xfs_dir2_block_removename(struct xfs_da_args *args); 39extern int xfs_dir2_block_removename(struct xfs_da_args *args);
40extern int xfs_dir2_block_replace(struct xfs_da_args *args); 40extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, 91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
92 struct xfs_dir2_leaf_entry *ents, int *indexp, 92 struct xfs_dir2_leaf_entry *ents, int *indexp,
93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); 93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, 94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
95 size_t bufsize, xfs_off_t *offset, filldir_t filldir); 95 size_t bufsize);
96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, 96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
97 struct xfs_buf **bpp, __uint16_t magic); 97 struct xfs_buf **bpp, __uint16_t magic);
98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, 98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
153 int size, xfs_dir2_sf_hdr_t *sfhp); 153 int size, xfs_dir2_sf_hdr_t *sfhp);
154extern int xfs_dir2_sf_addname(struct xfs_da_args *args); 154extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); 155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, 156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
157 xfs_off_t *offset, filldir_t filldir);
158extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); 157extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
159extern int xfs_dir2_sf_removename(struct xfs_da_args *args); 158extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
160extern int xfs_dir2_sf_replace(struct xfs_da_args *args); 159extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 6157424dbf8f..97676a347da1 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
768int /* error */ 768int /* error */
769xfs_dir2_sf_getdents( 769xfs_dir2_sf_getdents(
770 xfs_inode_t *dp, /* incore directory inode */ 770 xfs_inode_t *dp, /* incore directory inode */
771 void *dirent, 771 struct dir_context *ctx)
772 xfs_off_t *offset,
773 filldir_t filldir)
774{ 772{
775 int i; /* shortform entry number */ 773 int i; /* shortform entry number */
776 xfs_mount_t *mp; /* filesystem mount point */ 774 xfs_mount_t *mp; /* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
802 /* 800 /*
803 * If the block number in the offset is out of range, we're done. 801 * If the block number in the offset is out of range, we're done.
804 */ 802 */
805 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 803 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
806 return 0; 804 return 0;
807 805
808 /* 806 /*
@@ -819,22 +817,20 @@ xfs_dir2_sf_getdents(
819 /* 817 /*
820 * Put . entry unless we're starting past it. 818 * Put . entry unless we're starting past it.
821 */ 819 */
822 if (*offset <= dot_offset) { 820 if (ctx->pos <= dot_offset) {
823 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { 821 ctx->pos = dot_offset & 0x7fffffff;
824 *offset = dot_offset & 0x7fffffff; 822 if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
825 return 0; 823 return 0;
826 }
827 } 824 }
828 825
829 /* 826 /*
830 * Put .. entry unless we're starting past it. 827 * Put .. entry unless we're starting past it.
831 */ 828 */
832 if (*offset <= dotdot_offset) { 829 if (ctx->pos <= dotdot_offset) {
833 ino = xfs_dir2_sf_get_parent_ino(sfp); 830 ino = xfs_dir2_sf_get_parent_ino(sfp);
834 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 831 ctx->pos = dotdot_offset & 0x7fffffff;
835 *offset = dotdot_offset & 0x7fffffff; 832 if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
836 return 0; 833 return 0;
837 }
838 } 834 }
839 835
840 /* 836 /*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
845 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 841 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
846 xfs_dir2_sf_get_offset(sfep)); 842 xfs_dir2_sf_get_offset(sfep));
847 843
848 if (*offset > off) { 844 if (ctx->pos > off) {
849 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 845 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
850 continue; 846 continue;
851 } 847 }
852 848
853 ino = xfs_dir2_sfe_get_ino(sfp, sfep); 849 ino = xfs_dir2_sfe_get_ino(sfp, sfep);
854 if (filldir(dirent, (char *)sfep->name, sfep->namelen, 850 ctx->pos = off & 0x7fffffff;
855 off & 0x7fffffff, ino, DT_UNKNOWN)) { 851 if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
856 *offset = off & 0x7fffffff; 852 ino, DT_UNKNOWN))
857 return 0; 853 return 0;
858 }
859 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 854 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
860 } 855 }
861 856
862 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 857 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
863 0x7fffffff; 858 0x7fffffff;
864 return 0; 859 return 0;
865} 860}
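All three getdents paths (shortform, block, leaf) now follow the same resume invariant: ctx->pos is set to the entry's cookie before dir_emit(), so a full user buffer leaves the position naming the entry that did not fit and the next call retries it. The shape as a hypothetical helper, not part of the patch:

#include <linux/fs.h>

static bool emit_at(struct dir_context *ctx, const char *name, int namelen,
                    u64 ino, loff_t cookie)
{
        ctx->pos = cookie & 0x7fffffff; /* position first ... */
        return dir_emit(ctx, name, namelen, ino, DT_UNKNOWN);
                                        /* ... so a false return resumes here */
}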
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..0adf27ecf3f1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
570 xfs_buf_t **O_bpp, 570 xfs_buf_t **O_bpp,
571 uint flags) 571 uint flags)
572{ 572{
573 xfs_bmbt_irec_t map; 573 struct xfs_bmbt_irec map;
574 int nmaps = 1, error; 574 int nmaps = 1, error;
575 xfs_buf_t *bp; 575 struct xfs_buf *bp;
576 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 576 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
577 xfs_mount_t *mp = dqp->q_mount; 577 struct xfs_mount *mp = dqp->q_mount;
578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
579 xfs_trans_t *tp = (tpp ? *tpp : NULL); 579 struct xfs_trans *tp = (tpp ? *tpp : NULL);
580 580
581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
582 582
@@ -804,7 +804,7 @@ xfs_qm_dqget(
804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ 804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
805{ 805{
806 struct xfs_quotainfo *qi = mp->m_quotainfo; 806 struct xfs_quotainfo *qi = mp->m_quotainfo;
807 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 807 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
808 struct xfs_dquot *dqp; 808 struct xfs_dquot *dqp;
809 int error; 809 int error;
810 810
@@ -936,6 +936,7 @@ xfs_qm_dqput_final(
936{ 936{
937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; 937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
938 struct xfs_dquot *gdqp; 938 struct xfs_dquot *gdqp;
939 struct xfs_dquot *pdqp;
939 940
940 trace_xfs_dqput_free(dqp); 941 trace_xfs_dqput_free(dqp);
941 942
@@ -949,21 +950,29 @@ xfs_qm_dqput_final(
949 950
950 /* 951 /*
951 * If we just added a udquot to the freelist, then we want to release 952 * If we just added a udquot to the freelist, then we want to release
952 * the gdquot reference that it (probably) has. Otherwise it'll keep 953 * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
953 * the gdquot from getting reclaimed. 954 * keep the gdquot/pdquot from getting reclaimed.
954 */ 955 */
955 gdqp = dqp->q_gdquot; 956 gdqp = dqp->q_gdquot;
956 if (gdqp) { 957 if (gdqp) {
957 xfs_dqlock(gdqp); 958 xfs_dqlock(gdqp);
958 dqp->q_gdquot = NULL; 959 dqp->q_gdquot = NULL;
959 } 960 }
961
962 pdqp = dqp->q_pdquot;
963 if (pdqp) {
964 xfs_dqlock(pdqp);
965 dqp->q_pdquot = NULL;
966 }
960 xfs_dqunlock(dqp); 967 xfs_dqunlock(dqp);
961 968
962 /* 969 /*
963 * If we had a group quota hint, release it now. 970 * If we had a group/project quota hint, release it now.
964 */ 971 */
965 if (gdqp) 972 if (gdqp)
966 xfs_qm_dqput(gdqp); 973 xfs_qm_dqput(gdqp);
974 if (pdqp)
975 xfs_qm_dqput(pdqp);
967} 976}
968 977
969/* 978/*
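The release path above detaches both hint pointers while the locks are held and only drops the references after unlocking, keeping xfs_qm_dqput() out of the locked region. A user-space sketch of the detach-then-put shape, simplified to a single lock and illustrative types:

#include <pthread.h>
#include <stddef.h>

struct dq {
        pthread_mutex_t lock;
        struct dq *gd_hint;             /* group quota hint */
        struct dq *pd_hint;             /* project quota hint */
};

static void dq_put(struct dq *dqp)
{
        /* refcount release elided in this sketch */
}

static void dq_put_final(struct dq *dqp)
{
        struct dq *gd, *pd;

        pthread_mutex_lock(&dqp->lock);
        gd = dqp->gd_hint;
        dqp->gd_hint = NULL;
        pd = dqp->pd_hint;
        dqp->pd_hint = NULL;
        pthread_mutex_unlock(&dqp->lock);

        if (gd)
                dq_put(gd);
        if (pd)
                dq_put(pd);
}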
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..55abbca2883d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -53,6 +53,7 @@ typedef struct xfs_dquot {
53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */ 53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */
54 54
55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */ 55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */
56 struct xfs_dquot*q_pdquot; /* project dquot, hint only */
56 xfs_disk_dquot_t q_core; /* actual usage & quotas */ 57 xfs_disk_dquot_t q_core; /* actual usage & quotas */
57 xfs_dq_logitem_t q_logitem; /* dquot log item */ 58 xfs_dq_logitem_t q_logitem; /* dquot log item */
58 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ 59 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -118,8 +119,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
118 case XFS_DQ_USER: 119 case XFS_DQ_USER:
119 return XFS_IS_UQUOTA_ON(mp); 120 return XFS_IS_UQUOTA_ON(mp);
120 case XFS_DQ_GROUP: 121 case XFS_DQ_GROUP:
122 return XFS_IS_GQUOTA_ON(mp);
121 case XFS_DQ_PROJ: 123 case XFS_DQ_PROJ:
122 return XFS_IS_OQUOTA_ON(mp); 124 return XFS_IS_PQUOTA_ON(mp);
123 default: 125 default:
124 return 0; 126 return 0;
125 } 127 }
@@ -131,8 +133,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
131 case XFS_DQ_USER: 133 case XFS_DQ_USER:
132 return ip->i_udquot; 134 return ip->i_udquot;
133 case XFS_DQ_GROUP: 135 case XFS_DQ_GROUP:
134 case XFS_DQ_PROJ:
135 return ip->i_gdquot; 136 return ip->i_gdquot;
137 case XFS_DQ_PROJ:
138 return ip->i_pdquot;
136 default: 139 default:
137 return NULL; 140 return NULL;
138 } 141 }
@@ -143,10 +146,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
143#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 146#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
144#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 147#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
145#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) 148#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
146#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
147#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
148 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
149 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
150 149
151extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, 150extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
152 uint, struct xfs_dquot **); 151 uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a5f2042aec8b..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -906,11 +906,10 @@ xfs_file_release(
906 906
907STATIC int 907STATIC int
908xfs_file_readdir( 908xfs_file_readdir(
909 struct file *filp, 909 struct file *file,
910 void *dirent, 910 struct dir_context *ctx)
911 filldir_t filldir)
912{ 911{
913 struct inode *inode = file_inode(filp); 912 struct inode *inode = file_inode(file);
914 xfs_inode_t *ip = XFS_I(inode); 913 xfs_inode_t *ip = XFS_I(inode);
915 int error; 914 int error;
916 size_t bufsize; 915 size_t bufsize;
@@ -929,8 +928,7 @@ xfs_file_readdir(
929 */ 928 */
930 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 929 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
931 930
932 error = xfs_readdir(ip, dirent, bufsize, 931 error = xfs_readdir(ip, ctx, bufsize);
933 (xfs_off_t *)&filp->f_pos, filldir);
934 if (error) 932 if (error)
935 return -error; 933 return -error;
936 return 0; 934 return 0;
@@ -1270,8 +1268,7 @@ xfs_seek_data(
1270 } 1268 }
1271 1269
1272out: 1270out:
1273 if (offset != file->f_pos) 1271 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1274 file->f_pos = offset;
1275 1272
1276out_unlock: 1273out_unlock:
1277 xfs_iunlock_map_shared(ip, lock); 1274 xfs_iunlock_map_shared(ip, lock);
@@ -1379,8 +1376,7 @@ out:
1379 * situation in particular. 1376 * situation in particular.
1380 */ 1377 */
1381 offset = min_t(loff_t, offset, isize); 1378 offset = min_t(loff_t, offset, isize);
1382 if (offset != file->f_pos) 1379 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1383 file->f_pos = offset;
1384 1380
1385out_unlock: 1381out_unlock:
1386 xfs_iunlock_map_shared(ip, lock); 1382 xfs_iunlock_map_shared(ip, lock);
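vfs_setpos() folds the open-coded compare-and-assign into a helper that also validates the offset against the filesystem maximum. Roughly what it does, sketched in user space (the kernel helper additionally resets the cached f_version when the position changes):

#include <sys/types.h>

static off_t setpos(off_t *f_pos, off_t offset, off_t maxsize)
{
        if (offset < 0 || offset > maxsize)
                return -1;              /* kernel returns -EINVAL here */
        if (offset != *f_pos)
                *f_pos = offset;
        return offset;
}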
@@ -1432,7 +1428,7 @@ const struct file_operations xfs_file_operations = {
1432const struct file_operations xfs_dir_file_operations = { 1428const struct file_operations xfs_dir_file_operations = {
1433 .open = xfs_dir_open, 1429 .open = xfs_dir_open,
1434 .read = generic_read_dir, 1430 .read = generic_read_dir,
1435 .readdir = xfs_file_readdir, 1431 .iterate = xfs_file_readdir,
1436 .llseek = generic_file_llseek, 1432 .llseek = generic_file_llseek,
1437 .unlocked_ioctl = xfs_file_ioctl, 1433 .unlocked_ioctl = xfs_file_ioctl,
1438#ifdef CONFIG_COMPAT 1434#ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
176 if (!bp) 176 if (!bp)
177 return EIO; 177 return EIO;
178 if (bp->b_error) { 178 if (bp->b_error) {
179 int error = bp->b_error; 179 error = bp->b_error;
180 xfs_buf_relse(bp); 180 xfs_buf_relse(bp);
181 return error; 181 return error;
182 } 182 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
38#include "xfs_bmap.h" 38#include "xfs_bmap.h"
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h"
41 42
42 43
43/* 44/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
150#endif 151#endif
151 152
152/* 153/*
153 * Initialise a new set of inodes. 154 * Initialise a new set of inodes. When called without a transaction context
155 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
156 * than logging them (which in a transaction context puts them into the AIL
157 * for writeback rather than the xfsbufd queue).
154 */ 158 */
155STATIC int 159int
156xfs_ialloc_inode_init( 160xfs_ialloc_inode_init(
157 struct xfs_mount *mp, 161 struct xfs_mount *mp,
158 struct xfs_trans *tp, 162 struct xfs_trans *tp,
163 struct list_head *buffer_list,
159 xfs_agnumber_t agno, 164 xfs_agnumber_t agno,
160 xfs_agblock_t agbno, 165 xfs_agblock_t agbno,
161 xfs_agblock_t length, 166 xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
208 version = 3; 213 version = 3;
209 ino = XFS_AGINO_TO_INO(mp, agno, 214 ino = XFS_AGINO_TO_INO(mp, agno,
210 XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); 215 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
216
217 /*
 218 * log the initialisation that is about to take place as a
 219 * logical operation. This means the transaction does not
220 * need to log the physical changes to the inode buffers as log
221 * recovery will know what initialisation is actually needed.
222 * Hence we only need to log the buffers as "ordered" buffers so
223 * they track in the AIL as if they were physically logged.
224 */
225 if (tp)
226 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
227 mp->m_sb.sb_inodesize, length, gen);
211 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 228 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
212 version = 2; 229 version = 2;
213 else 230 else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
223 XBF_UNMAPPED); 240 XBF_UNMAPPED);
224 if (!fbuf) 241 if (!fbuf)
225 return ENOMEM; 242 return ENOMEM;
226 /* 243
227 * Initialize all inodes in this buffer and then log them. 244 /* Initialize the inode buffers and log them appropriately. */
228 *
229 * XXX: It would be much better if we had just one transaction
230 * to log a whole cluster of inodes instead of all the
231 * individual transactions causing a lot of log traffic.
232 */
233 fbuf->b_ops = &xfs_inode_buf_ops; 245 fbuf->b_ops = &xfs_inode_buf_ops;
234 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 246 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
235 for (i = 0; i < ninodes; i++) { 247 for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
247 ino++; 259 ino++;
248 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); 260 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
249 xfs_dinode_calc_crc(mp, free); 261 xfs_dinode_calc_crc(mp, free);
250 } else { 262 } else if (tp) {
251 /* just log the inode core */ 263 /* just log the inode core */
252 xfs_trans_log_buf(tp, fbuf, ioffset, 264 xfs_trans_log_buf(tp, fbuf, ioffset,
253 ioffset + isize - 1); 265 ioffset + isize - 1);
254 } 266 }
255 } 267 }
256 if (version == 3) { 268
257 /* need to log the entire buffer */ 269 if (tp) {
258 xfs_trans_log_buf(tp, fbuf, 0, 270 /*
259 BBTOB(fbuf->b_length) - 1); 271 * Mark the buffer as an inode allocation buffer so it
 272 * sticks in the AIL at the point of this allocation
 273 * transaction. This ensures that it is on disk before
274 * the tail of the log can be moved past this
275 * transaction (i.e. by preventing relogging from moving
276 * it forward in the log).
277 */
278 xfs_trans_inode_alloc_buf(tp, fbuf);
279 if (version == 3) {
280 /*
 281 * Mark the buffer as ordered so that it is
 282 * not physically logged in the transaction but
 283 * is still tracked in the AIL as part of the
 284 * transaction and still pins the log appropriately.
285 */
286 xfs_trans_ordered_buf(tp, fbuf);
287 xfs_trans_log_buf(tp, fbuf, 0,
288 BBTOB(fbuf->b_length) - 1);
289 }
290 } else {
291 fbuf->b_flags |= XBF_DONE;
292 xfs_buf_delwri_queue(fbuf, buffer_list);
293 xfs_buf_relse(fbuf);
260 } 294 }
261 xfs_trans_inode_alloc_buf(tp, fbuf);
262 } 295 }
263 return 0; 296 return 0;
264} 297}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
303 * First try to allocate inodes contiguous with the last-allocated 336 * First try to allocate inodes contiguous with the last-allocated
304 * chunk of inodes. If the filesystem is striped, this will fill 337 * chunk of inodes. If the filesystem is striped, this will fill
305 * an entire stripe unit with inodes. 338 * an entire stripe unit with inodes.
306 */ 339 */
307 agi = XFS_BUF_TO_AGI(agbp); 340 agi = XFS_BUF_TO_AGI(agbp);
308 newino = be32_to_cpu(agi->agi_newino); 341 newino = be32_to_cpu(agi->agi_newino);
309 agno = be32_to_cpu(agi->agi_seqno); 342 agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
402 * rather than a linear progression to prevent the next generation 435 * rather than a linear progression to prevent the next generation
403 * number from being easily guessable. 436 * number from being easily guessable.
404 */ 437 */
405 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, 438 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
406 args.len, prandom_u32()); 439 args.len, prandom_u32());
407 440
408 if (error) 441 if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
615 struct xfs_btree_cur *cur, 648 struct xfs_btree_cur *cur,
616 xfs_agino_t agino, 649 xfs_agino_t agino,
617 xfs_inobt_rec_incore_t *rec, 650 xfs_inobt_rec_incore_t *rec,
618 int *done, 651 int *done)
619 int left)
620{ 652{
621 int error; 653 int error;
622 int i; 654 int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
724 pag->pagl_leftrec != NULLAGINO && 756 pag->pagl_leftrec != NULLAGINO &&
725 pag->pagl_rightrec != NULLAGINO) { 757 pag->pagl_rightrec != NULLAGINO) {
726 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, 758 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
727 &trec, &doneleft, 1); 759 &trec, &doneleft);
728 if (error) 760 if (error)
729 goto error1; 761 goto error1;
730 762
731 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, 763 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
732 &rec, &doneright, 0); 764 &rec, &doneright);
733 if (error) 765 if (error)
734 goto error1; 766 goto error1;
735 } else { 767 } else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
150int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153/*
154 * Inode chunk initialisation routine
155 */
156int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
157 struct list_head *buffer_list,
158 xfs_agnumber_t agno, xfs_agblock_t agbno,
159 xfs_agblock_t length, unsigned int gen);
160
153extern const struct xfs_buf_ops xfs_agi_buf_ops; 161extern const struct xfs_buf_ops xfs_agi_buf_ops;
154 162
155#endif /* __XFS_IALLOC_H__ */ 163#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..3f90e1ceb8d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,9 @@ xfs_iget_cache_miss(
335 iflags = XFS_INEW; 335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE) 336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE; 337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL; 338 ip->i_udquot = NULL;
339 ip->i_gdquot = NULL;
340 ip->i_pdquot = NULL;
339 xfs_iflags_set(ip, iflags); 341 xfs_iflags_set(ip, iflags);
340 342
341 /* insert the new inode */ 343 /* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *); 41void xfs_eofblocks_worker(struct work_struct *);
42 42
43int xfs_sync_inode_grab(struct xfs_inode *ip);
44int xfs_inode_ag_iterator(struct xfs_mount *mp, 43int xfs_inode_ag_iterator(struct xfs_mount *mp,
45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, 44 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
46 int flags, void *args), 45 int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (c) 2008-2010, 2013 Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h"
41#include "xfs_icreate_item.h"
42
43kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
44
45static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
46{
47 return container_of(lip, struct xfs_icreate_item, ic_item);
48}
49
50/*
 51 * This returns the number of iovecs needed to log the given inode create item.
52 *
53 * We only need one iovec for the icreate log structure.
54 */
55STATIC uint
56xfs_icreate_item_size(
57 struct xfs_log_item *lip)
58{
59 return 1;
60}
61
62/*
63 * This is called to fill in the vector of log iovecs for the
64 * given inode create log item.
65 */
66STATIC void
67xfs_icreate_item_format(
68 struct xfs_log_item *lip,
69 struct xfs_log_iovec *log_vector)
70{
71 struct xfs_icreate_item *icp = ICR_ITEM(lip);
72
73 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
74 log_vector->i_len = sizeof(struct xfs_icreate_log);
75 log_vector->i_type = XLOG_REG_TYPE_ICREATE;
76}
77
78
79/* Pinning has no meaning for the create item, so just return. */
80STATIC void
81xfs_icreate_item_pin(
82 struct xfs_log_item *lip)
83{
84}
85
86
 87/* Unpinning has no meaning for the create item, so just return. */
88STATIC void
89xfs_icreate_item_unpin(
90 struct xfs_log_item *lip,
91 int remove)
92{
93}
94
95STATIC void
96xfs_icreate_item_unlock(
97 struct xfs_log_item *lip)
98{
99 struct xfs_icreate_item *icp = ICR_ITEM(lip);
100
101 if (icp->ic_item.li_flags & XFS_LI_ABORTED)
102 kmem_zone_free(xfs_icreate_zone, icp);
103 return;
104}
105
106/*
107 * Because we have ordered buffers being tracked in the AIL for the inode
108 * creation, we don't need the create item after this. Hence we can free
109 * the log item and return -1 to tell the caller we're done with the item.
110 */
111STATIC xfs_lsn_t
112xfs_icreate_item_committed(
113 struct xfs_log_item *lip,
114 xfs_lsn_t lsn)
115{
116 struct xfs_icreate_item *icp = ICR_ITEM(lip);
117
118 kmem_zone_free(xfs_icreate_zone, icp);
119 return (xfs_lsn_t)-1;
120}
121
122/* item can never get into the AIL */
123STATIC uint
124xfs_icreate_item_push(
125 struct xfs_log_item *lip,
126 struct list_head *buffer_list)
127{
128 ASSERT(0);
129 return XFS_ITEM_SUCCESS;
130}
131
132/* Ordered buffers do the dependency tracking here, so this does nothing. */
133STATIC void
134xfs_icreate_item_committing(
135 struct xfs_log_item *lip,
136 xfs_lsn_t lsn)
137{
138}
139
140/*
 141 * This is the ops vector shared by all icreate log items.
142 */
143static struct xfs_item_ops xfs_icreate_item_ops = {
144 .iop_size = xfs_icreate_item_size,
145 .iop_format = xfs_icreate_item_format,
146 .iop_pin = xfs_icreate_item_pin,
147 .iop_unpin = xfs_icreate_item_unpin,
148 .iop_push = xfs_icreate_item_push,
149 .iop_unlock = xfs_icreate_item_unlock,
150 .iop_committed = xfs_icreate_item_committed,
151 .iop_committing = xfs_icreate_item_committing,
152};
153
154
155/*
 156 * Initialize the inode create log item for a newly allocated inode chunk.
157 *
158 * Inode extents can only reside within an AG. Hence specify the starting
159 * block for the inode chunk by offset within an AG as well as the
160 * length of the allocated extent.
161 *
162 * This joins the item to the transaction and marks it dirty so
163 * that we don't need a separate call to do this, nor does the
164 * caller need to know anything about the icreate item.
165 */
166void
167xfs_icreate_log(
168 struct xfs_trans *tp,
169 xfs_agnumber_t agno,
170 xfs_agblock_t agbno,
171 unsigned int count,
172 unsigned int inode_size,
173 xfs_agblock_t length,
174 unsigned int generation)
175{
176 struct xfs_icreate_item *icp;
177
178 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
179
180 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
181 &xfs_icreate_item_ops);
182
183 icp->ic_format.icl_type = XFS_LI_ICREATE;
184 icp->ic_format.icl_size = 1; /* single vector */
185 icp->ic_format.icl_ag = cpu_to_be32(agno);
186 icp->ic_format.icl_agbno = cpu_to_be32(agbno);
187 icp->ic_format.icl_count = cpu_to_be32(count);
188 icp->ic_format.icl_isize = cpu_to_be32(inode_size);
189 icp->ic_format.icl_length = cpu_to_be32(length);
190 icp->ic_format.icl_gen = cpu_to_be32(generation);
191
192 xfs_trans_add_item(tp, &icp->ic_item);
193 tp->t_flags |= XFS_TRANS_DIRTY;
194 icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
195}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2008-2010, Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1
20
21/*
22 * on disk log item structure
23 *
 24 * Log recovery assumes the first two entries are the type and size and that
 25 * together they fit in 32 bits. They are also in host order (ugh), so they
 26 * have to be 32-bit aligned for decoding to be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */
40struct xfs_icreate_item {
41 struct xfs_log_item ic_item;
42 struct xfs_icreate_log ic_format;
43};
44
45extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
46
47void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
48 xfs_agblock_t agbno, unsigned int count,
49 unsigned int inode_size, xfs_agblock_t length,
50 unsigned int generation);
51
52#endif /* XFS_ICREATE_ITEM_H */
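Per the layout comment above, the record begins with two host-order 16-bit fields (type and size) followed by six big-endian 32-bit fields. A user-space sketch of a decoder under exactly those assumptions, with ntohl() standing in for be32_to_cpu():

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>  /* ntohl() */

struct icreate_rec {
        uint16_t type, size;                            /* host order */
        uint32_t ag, agbno, count, isize, length, gen;  /* decoded */
};

static void icreate_decode(const void *buf, struct icreate_rec *r)
{
        uint16_t u16[2];
        uint32_t be[6];

        memcpy(u16, buf, sizeof(u16));                  /* host order */
        memcpy(be, (const char *)buf + 4, sizeof(be));  /* big endian */

        r->type = u16[0];
        r->size = u16[1];
        r->ag = ntohl(be[0]);
        r->agbno = ntohl(be[1]);
        r->count = ntohl(be[2]);
        r->isize = ntohl(be[3]);
        r->length = ntohl(be[4]);
        r->gen = ntohl(be[5]);
}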
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..bb262c25c8de 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899 to->di_flushiter = cpu_to_be16(from->di_flushiter);
900 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 899 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
901 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 900 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
902 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 901 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
924 to->di_lsn = cpu_to_be64(from->di_lsn); 923 to->di_lsn = cpu_to_be64(from->di_lsn);
925 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 924 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
926 uuid_copy(&to->di_uuid, &from->di_uuid); 925 uuid_copy(&to->di_uuid, &from->di_uuid);
926 to->di_flushiter = 0;
927 } else {
928 to->di_flushiter = cpu_to_be16(from->di_flushiter);
927 } 929 }
928} 930}
929 931
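di_flushiter is now written out only for v1/v2 inodes and zeroed for v3 inodes, where the field is just padding under the CRC. The version split, sketched in isolation with htons() standing in for cpu_to_be16() and illustrative structures:

#include <stdint.h>
#include <arpa/inet.h>  /* htons() */

struct incore_inode { int version; uint16_t flushiter; };
struct ondisk_inode { uint16_t flushiter_be; };

static void flushiter_to_disk(const struct incore_inode *from,
                              struct ondisk_inode *to)
{
        if (from->version >= 3)
                to->flushiter_be = 0;   /* zeroed padding on v3 */
        else
                to->flushiter_be = htons(from->flushiter);
}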
@@ -1028,6 +1030,15 @@ xfs_dinode_calc_crc(
1028 1030
1029/* 1031/*
1030 * Read the disk inode attributes into the in-core inode structure. 1032 * Read the disk inode attributes into the in-core inode structure.
1033 *
1034 * For version 5 superblocks, if we are initialising a new inode and we are not
 1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
1036 * inode core with a random generation number. If we are keeping inodes around,
1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1031 */ 1042 */
1032int 1043int
1033xfs_iread( 1044xfs_iread(
@@ -1047,6 +1058,23 @@ xfs_iread(
1047 if (error) 1058 if (error)
1048 return error; 1059 return error;
1049 1060
1061 /* shortcut IO on inode allocation if possible */
1062 if ((iget_flags & XFS_IGET_CREATE) &&
1063 xfs_sb_version_hascrc(&mp->m_sb) &&
1064 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1065 /* initialise the on-disk inode core */
1066 memset(&ip->i_d, 0, sizeof(ip->i_d));
1067 ip->i_d.di_magic = XFS_DINODE_MAGIC;
1068 ip->i_d.di_gen = prandom_u32();
1069 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1070 ip->i_d.di_version = 3;
1071 ip->i_d.di_ino = ip->i_ino;
1072 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1073 } else
1074 ip->i_d.di_version = 2;
1075 return 0;
1076 }
1077
1050 /* 1078 /*
1051 * Get pointers to the on-disk inode and the buffer containing it. 1079 * Get pointers to the on-disk inode and the buffer containing it.
1052 */ 1080 */
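The shortcut above skips the disk read entirely when a brand-new inode is created on a CRC-enabled filesystem without XFS_MOUNT_IKEEP, since no old generation number needs preserving. The decision, sketched with simplified, illustrative fields:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct icore { unsigned int magic, gen, version; };

/* Returns true when the core was built in memory and no read is needed. */
static bool iread_shortcut(struct icore *core, bool creating,
                           bool has_crc, bool ikeep)
{
        if (!creating || !has_crc || ikeep)
                return false;           /* must read the cluster off disk */

        memset(core, 0, sizeof(*core));
        core->magic = 0x494e;           /* XFS_DINODE_MAGIC ("IN") */
        core->gen = (unsigned int)rand();       /* as prandom_u32() */
        core->version = 3;              /* hascrc implies v3 inodes */
        return true;
}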
@@ -1133,17 +1161,16 @@ xfs_iread(
1133 xfs_buf_set_ref(bp, XFS_INO_REF); 1161 xfs_buf_set_ref(bp, XFS_INO_REF);
1134 1162
1135 /* 1163 /*
1136 * Use xfs_trans_brelse() to release the buffer containing the 1164 * Use xfs_trans_brelse() to release the buffer containing the on-disk
1137 * on-disk inode, because it was acquired with xfs_trans_read_buf() 1165 * inode, because it was acquired with xfs_trans_read_buf() in
1138 * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal 1166 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
1139 * brelse(). If we're within a transaction, then xfs_trans_brelse() 1167 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1140 * will only release the buffer if it is not dirty within the 1168 * will only release the buffer if it is not dirty within the
1141 * transaction. It will be OK to release the buffer in this case, 1169 * transaction. It will be OK to release the buffer in this case,
1142 * because inodes on disk are never destroyed and we will be 1170 * because inodes on disk are never destroyed and we will be locking the
1143 * locking the new in-core inode before putting it in the hash 1171 * new in-core inode before putting it in the cache where other
1144 * table where other processes can find it. Thus we don't have 1172 * processes can find it. Thus we don't have to worry about the inode
1145 * to worry about the inode being changed just because we released 1173 * being changed just because we released the buffer.
1146 * the buffer.
1147 */ 1174 */
1148 out_brelse: 1175 out_brelse:
1149 xfs_trans_brelse(tp, bp); 1176 xfs_trans_brelse(tp, bp);
@@ -2028,8 +2055,6 @@ xfs_ifree(
2028 int error; 2055 int error;
2029 int delete; 2056 int delete;
2030 xfs_ino_t first_ino; 2057 xfs_ino_t first_ino;
2031 xfs_dinode_t *dip;
2032 xfs_buf_t *ibp;
2033 2058
2034 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2059 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2035 ASSERT(ip->i_d.di_nlink == 0); 2060 ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2067,13 @@ xfs_ifree(
2042 * Pull the on-disk inode from the AGI unlinked list. 2067 * Pull the on-disk inode from the AGI unlinked list.
2043 */ 2068 */
2044 error = xfs_iunlink_remove(tp, ip); 2069 error = xfs_iunlink_remove(tp, ip);
2045 if (error != 0) { 2070 if (error)
2046 return error; 2071 return error;
2047 }
2048 2072
2049 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2073 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2050 if (error != 0) { 2074 if (error)
2051 return error; 2075 return error;
2052 } 2076
2053 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2077 ip->i_d.di_mode = 0; /* mark incore inode as free */
2054 ip->i_d.di_flags = 0; 2078 ip->i_d.di_flags = 0;
2055 ip->i_d.di_dmevmask = 0; 2079 ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2085,10 @@ xfs_ifree(
2061 * by reincarnations of this inode. 2085 * by reincarnations of this inode.
2062 */ 2086 */
2063 ip->i_d.di_gen++; 2087 ip->i_d.di_gen++;
2064
2065 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2088 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2066 2089
2067 error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, 2090 if (delete)
2068 0, 0);
2069 if (error)
2070 return error;
2071
2072 /*
2073 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2074 * from picking up this inode when it is reclaimed (its incore state
2075 * initialzed but not flushed to disk yet). The in-core di_mode is
2076 * already cleared and a corresponding transaction logged.
2077 * The hack here just synchronizes the in-core to on-disk
2078 * di_mode value in advance before the actual inode sync to disk.
2079 * This is OK because the inode is already unlinked and would never
2080 * change its di_mode again for this inode generation.
2081 * This is a temporary hack that would require a proper fix
2082 * in the future.
2083 */
2084 dip->di_mode = 0;
2085
2086 if (delete) {
2087 error = xfs_ifree_cluster(ip, tp, first_ino); 2091 error = xfs_ifree_cluster(ip, tp, first_ino);
2088 }
2089 2092
2090 return error; 2093 return error;
2091} 2094}
@@ -2160,8 +2163,8 @@ xfs_iroot_realloc(
2160 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2163 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2161 (int)new_size); 2164 (int)new_size);
2162 ifp->if_broot_bytes = (int)new_size; 2165 ifp->if_broot_bytes = (int)new_size;
2163 ASSERT(ifp->if_broot_bytes <= 2166 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2164 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2167 XFS_IFORK_SIZE(ip, whichfork));
2165 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2168 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2166 return; 2169 return;
2167 } 2170 }
@@ -2214,8 +2217,9 @@ xfs_iroot_realloc(
2214 kmem_free(ifp->if_broot); 2217 kmem_free(ifp->if_broot);
2215 ifp->if_broot = new_broot; 2218 ifp->if_broot = new_broot;
2216 ifp->if_broot_bytes = (int)new_size; 2219 ifp->if_broot_bytes = (int)new_size;
2217 ASSERT(ifp->if_broot_bytes <= 2220 if (ifp->if_broot)
2218 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2221 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2222 XFS_IFORK_SIZE(ip, whichfork));
2219 return; 2223 return;
2220} 2224}
2221 2225
@@ -2526,9 +2530,8 @@ xfs_iflush_fork(
2526 if ((iip->ili_fields & brootflag[whichfork]) && 2530 if ((iip->ili_fields & brootflag[whichfork]) &&
2527 (ifp->if_broot_bytes > 0)) { 2531 (ifp->if_broot_bytes > 0)) {
2528 ASSERT(ifp->if_broot != NULL); 2532 ASSERT(ifp->if_broot != NULL);
2529 ASSERT(ifp->if_broot_bytes <= 2533 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2530 (XFS_IFORK_SIZE(ip, whichfork) + 2534 XFS_IFORK_SIZE(ip, whichfork));
2531 XFS_BROOT_SIZE_ADJ(ip)));
2532 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2535 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2533 (xfs_bmdr_block_t *)cp, 2536 (xfs_bmdr_block_t *)cp,
2534 XFS_DFORK_SIZE(dip, mp, whichfork)); 2537 XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2886,12 +2889,18 @@ xfs_iflush_int(
2886 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2889 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2887 goto corrupt_out; 2890 goto corrupt_out;
2888 } 2891 }
2892
2889 /* 2893 /*
2890 * bump the flush iteration count, used to detect flushes which 2894 * Inode item log recovery for v1/v2 inodes are dependent on the
2891 * postdate a log record during recovery. This is redundant as we now 2895 * di_flushiter count for correct sequencing. We bump the flush
2892 * log every change and hence this can't happen. Still, it doesn't hurt. 2896 * iteration count so we can detect flushes which postdate a log record
2897 * during recovery. This is redundant as we now log every change and
2898 * hence this can't happen but we need to still do it to ensure
2899 * backwards compatibility with old kernels that predate logging all
2900 * inode changes.
2893 */ 2901 */
2894 ip->i_d.di_flushiter++; 2902 if (ip->i_d.di_version < 3)
2903 ip->i_d.di_flushiter++;
2895 2904
2896 /* 2905 /*
2897 * Copy the dirty parts of the inode into the on-disk 2906 * Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91129794aaec..b55fd347ab5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -250,6 +250,7 @@ typedef struct xfs_inode {
250 struct xfs_mount *i_mount; /* fs mount struct ptr */ 250 struct xfs_mount *i_mount; /* fs mount struct ptr */
251 struct xfs_dquot *i_udquot; /* user dquot */ 251 struct xfs_dquot *i_udquot; /* user dquot */
252 struct xfs_dquot *i_gdquot; /* group dquot */ 252 struct xfs_dquot *i_gdquot; /* group dquot */
253 struct xfs_dquot *i_pdquot; /* project dquot */
253 254
254 /* Inode location stuff */ 255 /* Inode location stuff */
255 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 256 xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e999680094a..6e2bca5d44d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -248,7 +248,7 @@ xfs_open_by_handle(
248 goto out_dput; 248 goto out_dput;
249 } 249 }
250 250
251 fd = get_unused_fd(); 251 fd = get_unused_fd_flags(0);
252 if (fd < 0) { 252 if (fd < 0) {
253 error = fd; 253 error = fd;
254 goto out_dput; 254 goto out_dput;
@@ -928,7 +928,7 @@ xfs_ioctl_setattr(
928 struct xfs_trans *tp; 928 struct xfs_trans *tp;
929 unsigned int lock_flags = 0; 929 unsigned int lock_flags = 0;
930 struct xfs_dquot *udqp = NULL; 930 struct xfs_dquot *udqp = NULL;
931 struct xfs_dquot *gdqp = NULL; 931 struct xfs_dquot *pdqp = NULL;
932 struct xfs_dquot *olddquot = NULL; 932 struct xfs_dquot *olddquot = NULL;
933 int code; 933 int code;
934 934
@@ -957,7 +957,7 @@ xfs_ioctl_setattr(
957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, 958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
959 ip->i_d.di_gid, fa->fsx_projid, 959 ip->i_d.di_gid, fa->fsx_projid,
960 XFS_QMOPT_PQUOTA, &udqp, &gdqp); 960 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
961 if (code) 961 if (code)
962 return code; 962 return code;
963 } 963 }
@@ -994,8 +994,8 @@ xfs_ioctl_setattr(
994 XFS_IS_PQUOTA_ON(mp) && 994 XFS_IS_PQUOTA_ON(mp) &&
995 xfs_get_projid(ip) != fa->fsx_projid) { 995 xfs_get_projid(ip) != fa->fsx_projid) {
996 ASSERT(tp); 996 ASSERT(tp);
997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
998 capable(CAP_FOWNER) ? 998 pdqp, capable(CAP_FOWNER) ?
999 XFS_QMOPT_FORCE_RES : 0); 999 XFS_QMOPT_FORCE_RES : 0);
1000 if (code) /* out of quota */ 1000 if (code) /* out of quota */
1001 goto error_return; 1001 goto error_return;
@@ -1113,7 +1113,7 @@ xfs_ioctl_setattr(
1113 if (xfs_get_projid(ip) != fa->fsx_projid) { 1113 if (xfs_get_projid(ip) != fa->fsx_projid) {
1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1115 olddquot = xfs_qm_vop_chown(tp, ip, 1115 olddquot = xfs_qm_vop_chown(tp, ip,
1116 &ip->i_gdquot, gdqp); 1116 &ip->i_pdquot, pdqp);
1117 } 1117 }
1118 xfs_set_projid(ip, fa->fsx_projid); 1118 xfs_set_projid(ip, fa->fsx_projid);
1119 1119
@@ -1160,13 +1160,13 @@ xfs_ioctl_setattr(
1160 */ 1160 */
1161 xfs_qm_dqrele(olddquot); 1161 xfs_qm_dqrele(olddquot);
1162 xfs_qm_dqrele(udqp); 1162 xfs_qm_dqrele(udqp);
1163 xfs_qm_dqrele(gdqp); 1163 xfs_qm_dqrele(pdqp);
1164 1164
1165 return code; 1165 return code;
1166 1166
1167 error_return: 1167 error_return:
1168 xfs_qm_dqrele(udqp); 1168 xfs_qm_dqrele(udqp);
1169 xfs_qm_dqrele(gdqp); 1169 xfs_qm_dqrele(pdqp);
1170 xfs_trans_cancel(tp, 0); 1170 xfs_trans_cancel(tp, 0);
1171 if (lock_flags) 1171 if (lock_flags)
1172 xfs_iunlock(ip, lock_flags); 1172 xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
284 return 0; 284 return 0;
285 285
286 /* 286 /*
287 * If the file is smaller than the minimum prealloc and we are using
288 * dynamic preallocation, don't do any preallocation at all as it is
289 * likely this is the only write to the file that is going to be done.
290 */
291 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
292 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
293 return 0;
294
295 /*
287 * If there are any real blocks past eof, then don't 296 * If there are any real blocks past eof, then don't
288 * do any speculative allocation. 297 * do any speculative allocation.
289 */ 298 */
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
345 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 354 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
346 return 0; 355 return 0;
347 356
357 /* If the file is small, then use the minimum prealloc */
358 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
359 return 0;
360
348 /* 361 /*
349 * As we write multiple pages, the offset will always align to the 362 * As we write multiple pages, the offset will always align to the
350 * start of a page and hence point to a hole at EOF. i.e. if the size is 363 * start of a page and hence point to a hole at EOF. i.e. if the size is
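Both iomap hunks above add the same early-out: while a file is still smaller than the minimum preallocation size and no fixed allocsize was given at mount, speculative EOF preallocation is skipped entirely. A hedged sketch of that policy check (the flag value and all names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>

#define MOUNT_DFLT_IOSIZE (1u << 0)  /* admin pinned the prealloc size */

/*
 * With dynamic preallocation, a file below the minimum write size is
 * probably getting a single small write; don't preallocate for it.
 */
static bool want_eof_prealloc(uint32_t mount_flags, uint64_t isize,
			      uint64_t min_write_bytes)
{
	if (!(mount_flags & MOUNT_DFLT_IOSIZE) && isize < min_write_bytes)
		return false;
	return true;
}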
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..96dda62d497b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -467,9 +467,6 @@ xfs_setattr_mode(
467 ASSERT(tp); 467 ASSERT(tp);
468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
469 469
470 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
471 mode &= ~S_ISGID;
472
473 ip->i_d.di_mode &= S_IFMT; 470 ip->i_d.di_mode &= S_IFMT;
474 ip->i_d.di_mode |= mode & ~S_IFMT; 471 ip->i_d.di_mode |= mode & ~S_IFMT;
475 472
@@ -495,15 +492,18 @@ xfs_setattr_nonsize(
495 492
496 trace_xfs_setattr(ip); 493 trace_xfs_setattr(ip);
497 494
498 	if (mp->m_flags & XFS_MOUNT_RDONLY)	495 	/* If ACLs are being inherited, we already have this checked */
499 return XFS_ERROR(EROFS); 496 if (!(flags & XFS_ATTR_NOACL)) {
497 if (mp->m_flags & XFS_MOUNT_RDONLY)
498 return XFS_ERROR(EROFS);
500 499
501 if (XFS_FORCED_SHUTDOWN(mp)) 500 if (XFS_FORCED_SHUTDOWN(mp))
502 return XFS_ERROR(EIO); 501 return XFS_ERROR(EIO);
503 502
504 error = -inode_change_ok(inode, iattr); 503 error = -inode_change_ok(inode, iattr);
505 if (error) 504 if (error)
506 return XFS_ERROR(error); 505 return XFS_ERROR(error);
506 }
507 507
508 ASSERT((mask & ATTR_SIZE) == 0); 508 ASSERT((mask & ATTR_SIZE) == 0);
509 509
@@ -539,7 +539,7 @@ xfs_setattr_nonsize(
539 ASSERT(udqp == NULL); 539 ASSERT(udqp == NULL);
540 ASSERT(gdqp == NULL); 540 ASSERT(gdqp == NULL);
541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), 541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
542 qflags, &udqp, &gdqp); 542 qflags, &udqp, &gdqp, NULL);
543 if (error) 543 if (error)
544 return error; 544 return error;
545 } 545 }
@@ -575,7 +575,7 @@ xfs_setattr_nonsize(
575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { 575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
576 ASSERT(tp); 576 ASSERT(tp);
577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
578 capable(CAP_FOWNER) ? 578 NULL, capable(CAP_FOWNER) ?
579 XFS_QMOPT_FORCE_RES : 0); 579 XFS_QMOPT_FORCE_RES : 0);
580 if (error) /* out of quota */ 580 if (error) /* out of quota */
581 goto out_trans_cancel; 581 goto out_trans_cancel;
@@ -987,7 +987,8 @@ xfs_fiemap_format(
987 if (bmv->bmv_oflags & BMV_OF_PREALLOC) 987 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; 988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { 989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
990 fiemap_flags |= FIEMAP_EXTENT_DELALLOC; 990 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
991 FIEMAP_EXTENT_UNKNOWN);
991 physical = 0; /* no block yet */ 992 physical = 0; /* no block yet */
992 } 993 }
993 if (bmv->bmv_oflags & BMV_OF_LAST) 994 if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..b93e14b86754 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
43{ 43{
44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
45 (xfs_sb_version_hasquota(&mp->m_sb) && 45 (xfs_sb_version_hasquota(&mp->m_sb) &&
46 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); 46 xfs_is_quota_inode(&mp->m_sb, ino)));
47} 47}
48 48
49/* 49/*
@@ -221,7 +221,6 @@ xfs_bulkstat(
221 char __user *ubufp; /* pointer into user's buffer */ 221 char __user *ubufp; /* pointer into user's buffer */
222 int ubelem; /* spaces used in user's buffer */ 222 int ubelem; /* spaces used in user's buffer */
223 int ubused; /* bytes used by formatter */ 223 int ubused; /* bytes used by formatter */
224 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
225 224
226 /* 225 /*
227 * Get the last inode value, see if there's nothing to do. 226 * Get the last inode value, see if there's nothing to do.
@@ -263,7 +262,6 @@ xfs_bulkstat(
263 rval = 0; 262 rval = 0;
264 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 263 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
265 cond_resched(); 264 cond_resched();
266 bp = NULL;
267 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 265 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
268 if (error) { 266 if (error) {
269 /* 267 /*
@@ -383,11 +381,13 @@ xfs_bulkstat(
383 * Also start read-ahead now for this chunk. 381 * Also start read-ahead now for this chunk.
384 */ 382 */
385 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 383 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
384 struct blk_plug plug;
386 /* 385 /*
387 * Loop over all clusters in the next chunk. 386 * Loop over all clusters in the next chunk.
388 * Do a readahead if there are any allocated 387 * Do a readahead if there are any allocated
389 * inodes in that cluster. 388 * inodes in that cluster.
390 */ 389 */
390 blk_start_plug(&plug);
391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
392 for (chunkidx = 0; 392 for (chunkidx = 0;
393 chunkidx < XFS_INODES_PER_CHUNK; 393 chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +399,7 @@ xfs_bulkstat(
399 agbno, nbcluster, 399 agbno, nbcluster,
400 &xfs_inode_buf_ops); 400 &xfs_inode_buf_ops);
401 } 401 }
402 blk_finish_plug(&plug);
402 irbp->ir_startino = r.ir_startino; 403 irbp->ir_startino = r.ir_startino;
403 irbp->ir_freecount = r.ir_freecount; 404 irbp->ir_freecount = r.ir_freecount;
404 irbp->ir_free = r.ir_free; 405 irbp->ir_free = r.ir_free;
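blk_start_plug()/blk_finish_plug() bracket the readahead loop so the per-cluster submissions queue up on the task's plug and reach the block layer as merged, larger requests. A userspace analogue of the batch-then-flush shape (all names here are hypothetical; only the two plug calls named in the comments are the real kernel API):

#include <stddef.h>
#include <stdio.h>

struct batch { size_t queued; };

static void batch_start(struct batch *b)        { b->queued = 0; }
static void batch_add(struct batch *b, int bno) { b->queued++; (void)bno; }
static void batch_flush(struct batch *b)
{
	/* one submission for everything queued, like blk_finish_plug() */
	printf("submitting %zu readaheads in one go\n", b->queued);
	b->queued = 0;
}

int main(void)
{
	struct batch b;

	batch_start(&b);                 /* blk_start_plug(&plug);  */
	for (int bno = 0; bno < 8; bno++)
		batch_add(&b, bno);      /* per-cluster readahead   */
	batch_flush(&b);                 /* blk_finish_plug(&plug); */
	return 0;
}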
@@ -433,27 +434,7 @@ xfs_bulkstat(
433 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 434 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
434 chunkidx++, clustidx++, agino++) { 435 chunkidx++, clustidx++, agino++) {
435 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 436 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
436 /* 437
437 * Recompute agbno if this is the
438 * first inode of the cluster.
439 *
440 * Careful with clustidx. There can be
441 * multiple clusters per chunk, a single
442 * cluster per chunk or a cluster that has
443 * inodes represented from several different
444 * chunks (if blocksize is large).
445 *
446 * Because of this, the starting clustidx is
447 * initialized to zero in this loop but must
448 * later be reset after reading in the cluster
449 * buffer.
450 */
451 if ((chunkidx & (nicluster - 1)) == 0) {
452 agbno = XFS_AGINO_TO_AGBNO(mp,
453 irbp->ir_startino) +
454 ((chunkidx & nimask) >>
455 mp->m_sb.sb_inopblog);
456 }
457 ino = XFS_AGINO_TO_INO(mp, agno, agino); 438 ino = XFS_AGINO_TO_INO(mp, agno, agino);
458 /* 439 /*
459 * Skip if this inode is free. 440 * Skip if this inode is free.
@@ -499,10 +480,6 @@ xfs_bulkstat(
499 480
500 cond_resched(); 481 cond_resched();
501 } 482 }
502
503 if (bp)
504 xfs_buf_relse(bp);
505
506 /* 483 /*
507 * Set up for the next loop iteration. 484 * Set up for the next loop iteration.
508 */ 485 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
1963 headers++; 1963 headers++;
1964 1964
1965 for (lv = log_vector; lv; lv = lv->lv_next) { 1965 for (lv = log_vector; lv; lv = lv->lv_next) {
1966 /* we don't write ordered log vectors */
1967 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
1968 continue;
1969
1966 headers += lv->lv_niovecs; 1970 headers += lv->lv_niovecs;
1967 1971
1968 for (i = 0; i < lv->lv_niovecs; i++) { 1972 for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
2216 index = 0; 2220 index = 0;
2217 lv = log_vector; 2221 lv = log_vector;
2218 vecp = lv->lv_iovecp; 2222 vecp = lv->lv_iovecp;
2219 while (lv && index < lv->lv_niovecs) { 2223 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2220 void *ptr; 2224 void *ptr;
2221 int log_offset; 2225 int log_offset;
2222 2226
@@ -2236,13 +2240,22 @@ xlog_write(
2236 * This loop writes out as many regions as can fit in the amount 2240 * This loop writes out as many regions as can fit in the amount
2237 * of space which was allocated by xlog_state_get_iclog_space(). 2241 * of space which was allocated by xlog_state_get_iclog_space().
2238 */ 2242 */
2239 while (lv && index < lv->lv_niovecs) { 2243 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2240 struct xfs_log_iovec *reg = &vecp[index]; 2244 struct xfs_log_iovec *reg;
2241 struct xlog_op_header *ophdr; 2245 struct xlog_op_header *ophdr;
2242 int start_rec_copy; 2246 int start_rec_copy;
2243 int copy_len; 2247 int copy_len;
2244 int copy_off; 2248 int copy_off;
2249 bool ordered = false;
2250
2251 /* ordered log vectors have no regions to write */
2252 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
2253 ASSERT(lv->lv_niovecs == 0);
2254 ordered = true;
2255 goto next_lv;
2256 }
2245 2257
2258 reg = &vecp[index];
2246 ASSERT(reg->i_len % sizeof(__int32_t) == 0); 2259 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2247 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); 2260 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2248 2261
@@ -2302,12 +2315,13 @@ xlog_write(
2302 break; 2315 break;
2303 2316
2304 if (++index == lv->lv_niovecs) { 2317 if (++index == lv->lv_niovecs) {
2318next_lv:
2305 lv = lv->lv_next; 2319 lv = lv->lv_next;
2306 index = 0; 2320 index = 0;
2307 if (lv) 2321 if (lv)
2308 vecp = lv->lv_iovecp; 2322 vecp = lv->lv_iovecp;
2309 } 2323 }
2310 if (record_cnt == 0) { 2324 if (record_cnt == 0 && ordered == false) {
2311 if (!lv) 2325 if (!lv)
2312 return 0; 2326 return 0;
2313 break; 2327 break;
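xlog_write() must never feed an ordered vector into the byte accounting: its lv_buf_len holds the XFS_LOG_VEC_ORDERED sentinel (-1) rather than a size, and there are no iovecs to copy. A standalone model of the skip in the header-count loop, with the struct pared down to the two fields the diff uses:

#include <stdio.h>

#define LOG_VEC_ORDERED (-1)	/* sentinel, mirrors XFS_LOG_VEC_ORDERED */

struct log_vec {
	struct log_vec *next;
	int buf_len;		/* LOG_VEC_ORDERED => nothing to write */
	int niovecs;
};

static int count_headers(const struct log_vec *head)
{
	int headers = 0;

	for (const struct log_vec *lv = head; lv; lv = lv->next) {
		if (lv->buf_len == LOG_VEC_ORDERED)
			continue;	/* tracked in the CIL, never written */
		headers += lv->niovecs;
	}
	return headers;
}

int main(void)
{
	struct log_vec ordered = { NULL, LOG_VEC_ORDERED, 0 };
	struct log_vec normal  = { &ordered, 128, 3 };

	printf("%d\n", count_headers(&normal));	/* prints 3 */
	return 0;
}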
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
88#define XLOG_REG_TYPE_UNMOUNT 17 88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18 89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19 90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_MAX 19 91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
92 93
93typedef struct xfs_log_iovec { 94typedef struct xfs_log_iovec {
94 void *i_addr; /* beginning address of region */ 95 void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
105 int lv_buf_len; /* size of formatted buffer */ 106 int lv_buf_len; /* size of formatted buffer */
106}; 107};
107 108
109#define XFS_LOG_VEC_ORDERED (-1)
110
108/* 111/*
109 * Structure used to pass callback function and the function's argument 112 * Structure used to pass callback function and the function's argument
110 * to the log manager. 113 * to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
127 int index; 127 int index;
128 int len = 0; 128 int len = 0;
129 uint niovecs; 129 uint niovecs;
130 bool ordered = false;
130 131
131 /* Skip items which aren't dirty in this transaction. */ 132 /* Skip items which aren't dirty in this transaction. */
132 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 133 if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
137 if (!niovecs) 138 if (!niovecs)
138 continue; 139 continue;
139 140
141 /*
142 * Ordered items need to be tracked but we do not wish to write
143 * them. We need a logvec to track the object, but we do not
144 * need an iovec or buffer to be allocated for copying data.
145 */
146 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true;
148 niovecs = 0;
149 }
150
140 new_lv = kmem_zalloc(sizeof(*new_lv) + 151 new_lv = kmem_zalloc(sizeof(*new_lv) +
141 niovecs * sizeof(struct xfs_log_iovec), 152 niovecs * sizeof(struct xfs_log_iovec),
142 KM_SLEEP|KM_NOFS); 153 KM_SLEEP|KM_NOFS);
143 154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
144 /* The allocated iovec region lies beyond the log vector. */ 163 /* The allocated iovec region lies beyond the log vector. */
145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; 164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
146 new_lv->lv_niovecs = niovecs;
147 new_lv->lv_item = lidp->lid_item;
148 165
149 	/* build the vector array and calculate its length */	166
150 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
165 } 182 }
166 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
167 184
185next:
168 if (!ret_lv) 186 if (!ret_lv)
169 ret_lv = new_lv; 187 ret_lv = new_lv;
170 else 188 else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
191 209
192 if (old) { 210 if (old) {
193 /* existing lv on log item, space used is a delta */ 211 /* existing lv on log item, space used is a delta */
194 ASSERT(!list_empty(&lv->lv_item->li_cil)); 212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
195 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); 213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214
215 /*
216 * If the new item is ordered, keep the old one that is already
	217 	 * tracking dirty or ordered regions.
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 }
196 224
197 *len += lv->lv_buf_len - old->lv_buf_len; 225 *len += lv->lv_buf_len - old->lv_buf_len;
198 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
201 } else { 229 } else {
202 /* new lv, must pin the log item */ 230 /* new lv, must pin the log item */
203 ASSERT(!lv->lv_item->li_lv); 231 ASSERT(!lv->lv_item->li_lv);
204 ASSERT(list_empty(&lv->lv_item->li_cil));
205 232
206 *len += lv->lv_buf_len; 233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
207 *diff_iovecs += lv->lv_niovecs; 234 *len += lv->lv_buf_len;
235 *diff_iovecs += lv->lv_niovecs;
236 }
208 IOP_PIN(lv->lv_item); 237 IOP_PIN(lv->lv_item);
209 238
210 } 239 }
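xfs_cil_prepare_item() accounts space as a delta when an item is relogged, and an ordered vector must contribute neither bytes nor iovecs. A simplified, lock-free sketch of just the accounting rule (parameter names are mine, not the kernel's):

#define LOG_VEC_ORDERED (-1)

/* Accumulate the space delta a (re)logged item adds to the checkpoint. */
static void account_item(int new_len, int old_len,
			 int new_iovecs, int old_iovecs,
			 int *len, int *diff_iovecs)
{
	if (new_len == LOG_VEC_ORDERED)
		return;                    /* ordered: no data, no delta */
	*len += new_len - old_len;         /* old_len is 0 for a new item */
	*diff_iovecs += new_iovecs - old_iovecs;
}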
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
259 * We can do this safely because the context can't checkpoint until we 288 * We can do this safely because the context can't checkpoint until we
260 * are done so it doesn't matter exactly how we update the CIL. 289 * are done so it doesn't matter exactly how we update the CIL.
261 */ 290 */
262 for (lv = log_vector; lv; lv = lv->lv_next)
263 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
264
265 /* account for space used by new iovec headers */
266 len += diff_iovecs * sizeof(xlog_op_header_t);
267
268 spin_lock(&cil->xc_cil_lock); 291 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) {
293 struct xfs_log_vec *next = lv->lv_next;
269 294
270 /* move the items to the tail of the CIL */ 295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
271 for (lv = log_vector; lv; lv = lv->lv_next) 296 lv->lv_next = NULL;
297
298 /*
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
272 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); 302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 }
273 306
307 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t);
274 ctx->nvecs += diff_iovecs; 309 ctx->nvecs += diff_iovecs;
275 310
276 /* 311 /*
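Because xfs_cil_prepare_item() may now free the vector it is handed (when an ordered vector supersedes an existing one), the insertion loop snapshots lv_next and moves the item onto the CIL before making the call. The underlying safe-walk idiom, standalone:

#include <stdlib.h>

struct node { struct node *next; int ordered; };

/* Stand-in for a callee that may destroy the element it is given. */
static void maybe_free(struct node *n)
{
	if (n->ordered)
		free(n);
}

static void walk(struct node *head)
{
	for (struct node *n = head, *next; n; n = next) {
		next = n->next;    /* grab the link before it can vanish */
		n->next = NULL;    /* detach, then let the callee decide */
		maybe_free(n);     /* non-ordered nodes stay caller-owned */
	}
}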
@@ -381,9 +416,7 @@ xlog_cil_push(
381 struct xfs_cil_ctx *new_ctx; 416 struct xfs_cil_ctx *new_ctx;
382 struct xlog_in_core *commit_iclog; 417 struct xlog_in_core *commit_iclog;
383 struct xlog_ticket *tic; 418 struct xlog_ticket *tic;
384 int num_lv;
385 int num_iovecs; 419 int num_iovecs;
386 int len;
387 int error = 0; 420 int error = 0;
388 struct xfs_trans_header thdr; 421 struct xfs_trans_header thdr;
389 struct xfs_log_iovec lhdr; 422 struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
428 * side which is currently locked out by the flush lock. 461 * side which is currently locked out by the flush lock.
429 */ 462 */
430 lv = NULL; 463 lv = NULL;
431 num_lv = 0;
432 num_iovecs = 0; 464 num_iovecs = 0;
433 len = 0;
434 while (!list_empty(&cil->xc_cil)) { 465 while (!list_empty(&cil->xc_cil)) {
435 struct xfs_log_item *item; 466 struct xfs_log_item *item;
436 int i;
437 467
438 item = list_first_entry(&cil->xc_cil, 468 item = list_first_entry(&cil->xc_cil,
439 struct xfs_log_item, li_cil); 469 struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
444 lv->lv_next = item->li_lv; 474 lv->lv_next = item->li_lv;
445 lv = item->li_lv; 475 lv = item->li_lv;
446 item->li_lv = NULL; 476 item->li_lv = NULL;
447
448 num_lv++;
449 num_iovecs += lv->lv_niovecs; 477 num_iovecs += lv->lv_niovecs;
450 for (i = 0; i < lv->lv_niovecs; i++)
451 len += lv->lv_iovecp[i].i_len;
452 } 478 }
453 479
454 /* 480 /*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
701 if (commit_lsn) 727 if (commit_lsn)
702 *commit_lsn = log->l_cilp->xc_ctx->sequence; 728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
703 729
730 /* xlog_cil_insert_items() destroys log_vector list */
704 xlog_cil_insert_items(log, log_vector, tp->t_ticket); 731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
705 732
706 /* check we didn't blow the reservation */ 733 /* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..7681b19aa5dc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
45#include "xfs_cksum.h" 45#include "xfs_cksum.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47#include "xfs_icache.h" 47#include "xfs_icache.h"
48#include "xfs_icreate_item.h"
48 49
49/* Need all the magic numbers and buffer ops structures from these headers */ 50/* Need all the magic numbers and buffer ops structures from these headers */
50#include "xfs_symlink.h" 51#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
1617 	 * from the cancelled buffer table. Hence they have to be done last. 	1618 	 *
1618 * 1619 *
1619 * 3. Inode allocation buffers must be replayed before inode items that 1620 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it. 1621 * read the buffer and replay changes into it. For filesystems using the
1622 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1623 * treated the same as inode allocation buffers as they create and
1624 * initialise the buffers directly.
1621 * 1625 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed. 1626 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer 1627 * This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
1632 * from all the other buffers and move them to last. 1636 * from all the other buffers and move them to last.
1633 * 1637 *
1634 * Hence, 4 lists, in order from head to tail: 1638 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers 1639 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - item_list for all non-buffer items 1640 * - item_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers 1641 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers 1642 * - cancel_list for the cancelled buffers
1643 *
1644 * Note that we add objects to the tail of the lists so that first-to-last
1645 * ordering is preserved within the lists. Adding objects to the head of the
1646 * list means when we traverse from the head we walk them in last-to-first
1647 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1648 * but for all other items there may be specific ordering that we need to
1649 * preserve.
1639 */ 1650 */
1640STATIC int 1651STATIC int
1641xlog_recover_reorder_trans( 1652xlog_recover_reorder_trans(
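As noted in the expanded comment above, dispatch is by tail insertion so that each of the four lists preserves the original log order; XFS_LI_ICREATE items simply join the buffer list because they initialise buffers just like inode allocation buffers do (this forward-looking note is mine; the reordering code itself follows in the next hunk).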
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
1655 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1666 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1656 1667
1657 switch (ITEM_TYPE(item)) { 1668 switch (ITEM_TYPE(item)) {
1669 case XFS_LI_ICREATE:
1670 list_move_tail(&item->ri_list, &buffer_list);
1671 break;
1658 case XFS_LI_BUF: 1672 case XFS_LI_BUF:
1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1673 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1660 trace_xfs_log_recover_item_reorder_head(log, 1674 trace_xfs_log_recover_item_reorder_head(log,
@@ -2578,8 +2592,16 @@ xlog_recover_inode_pass2(
2578 goto error; 2592 goto error;
2579 } 2593 }
2580 2594
2581 /* Skip replay when the on disk inode is newer than the log one */ 2595 /*
2582 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2596 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2597 * are transactional and if ordering is necessary we can determine that
2598 * more accurately by the LSN field in the V3 inode core. Don't trust
	2599 	 * the inode versions as we might be changing them here - use the
2600 * superblock flag to determine whether we need to look at di_flushiter
	2601 	 * to skip replay when the on-disk inode is newer than the log one.
2602 */
2603 if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2604 dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2583 /* 2605 /*
2584 * Deal with the wrap case, DI_MAX_FLUSH is less 2606 * Deal with the wrap case, DI_MAX_FLUSH is less
2585 * than smaller numbers 2607 * than smaller numbers
@@ -2594,6 +2616,7 @@ xlog_recover_inode_pass2(
2594 goto error; 2616 goto error;
2595 } 2617 }
2596 } 2618 }
2619
2597 /* Take the opportunity to reset the flush iteration count */ 2620 /* Take the opportunity to reset the flush iteration count */
2598 dicp->di_flushiter = 0; 2621 dicp->di_flushiter = 0;
2599 2622
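On pre-CRC (v4 and older) superblocks, recovery still uses di_flushiter to decide whether the on-disk inode already postdates the logged copy, with one exception where the counter has just wrapped. A hedged standalone model; the half-range wrap threshold here is my illustration, not necessarily the kernel's exact test:

#include <stdbool.h>
#include <stdint.h>

#define DI_MAX_FLUSH 0xffff

/*
 * Skip replay when the on-disk counter is ahead of the logged one,
 * except when the on-disk value sits at the wrap point and the logged
 * value has already wrapped back to a small number.
 */
static bool skip_replay(bool has_crc, uint16_t log_iter, uint16_t disk_iter)
{
	if (has_crc)
		return false;            /* v5: ordering comes from LSNs */
	if (log_iter >= disk_iter)
		return false;            /* log copy is not older */
	if (disk_iter == DI_MAX_FLUSH && log_iter < (DI_MAX_FLUSH >> 1))
		return false;            /* wrap case: log really is newer */
	return true;
}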
@@ -2982,6 +3005,93 @@ xlog_recover_efd_pass2(
2982} 3005}
2983 3006
2984/* 3007/*
3008 * This routine is called when an inode create format structure is found in a
	3009 * committed transaction in the log. Its purpose is to initialise the inodes
3010 * being allocated on disk. This requires us to get inode cluster buffers that
	3011 * match the range to be initialised, stamped with inode templates and written
3012 * by delayed write so that subsequent modifications will hit the cached buffer
3013 * and only need writing out at the end of recovery.
3014 */
3015STATIC int
3016xlog_recover_do_icreate_pass2(
3017 struct xlog *log,
3018 struct list_head *buffer_list,
3019 xlog_recover_item_t *item)
3020{
3021 struct xfs_mount *mp = log->l_mp;
3022 struct xfs_icreate_log *icl;
3023 xfs_agnumber_t agno;
3024 xfs_agblock_t agbno;
3025 unsigned int count;
3026 unsigned int isize;
3027 xfs_agblock_t length;
3028
3029 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3030 if (icl->icl_type != XFS_LI_ICREATE) {
3031 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3032 return EINVAL;
3033 }
3034
3035 if (icl->icl_size != 1) {
3036 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3037 return EINVAL;
3038 }
3039
3040 agno = be32_to_cpu(icl->icl_ag);
3041 if (agno >= mp->m_sb.sb_agcount) {
3042 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3043 return EINVAL;
3044 }
3045 agbno = be32_to_cpu(icl->icl_agbno);
3046 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3047 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3048 return EINVAL;
3049 }
3050 isize = be32_to_cpu(icl->icl_isize);
3051 if (isize != mp->m_sb.sb_inodesize) {
3052 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3053 return EINVAL;
3054 }
3055 count = be32_to_cpu(icl->icl_count);
3056 if (!count) {
3057 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3058 return EINVAL;
3059 }
3060 length = be32_to_cpu(icl->icl_length);
3061 if (!length || length >= mp->m_sb.sb_agblocks) {
3062 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3063 return EINVAL;
3064 }
3065
3066 /* existing allocation is fixed value */
3067 ASSERT(count == XFS_IALLOC_INODES(mp));
3068 ASSERT(length == XFS_IALLOC_BLOCKS(mp));
3069 if (count != XFS_IALLOC_INODES(mp) ||
3070 length != XFS_IALLOC_BLOCKS(mp)) {
3071 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3072 return EINVAL;
3073 }
3074
3075 /*
3076 * Inode buffers can be freed. Do not replay the inode initialisation as
3077 * we could be overwriting something written after this inode buffer was
3078 * cancelled.
3079 *
3080 * XXX: we need to iterate all buffers and only init those that are not
	3081 * cancelled. I think that a more fine-grained factoring of
3082 * xfs_ialloc_inode_init may be appropriate here to enable this to be
3083 * done easily.
3084 */
3085 if (xlog_check_buffer_cancelled(log,
3086 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3087 return 0;
3088
3089 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3090 be32_to_cpu(icl->icl_gen));
3091 return 0;
3092}
3093
3094/*
2985 * Free up any resources allocated by the transaction 3095 * Free up any resources allocated by the transaction
2986 * 3096 *
2987 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3097 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
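Everything xlog_recover_do_icreate_pass2() reads out of the log record is bounds-checked before any buffer is touched, and a positive EINVAL is returned on the first bad field (XFS's internal errno convention). The same validate-before-replay shape in a standalone sketch, with invented types and limits:

#include <errno.h>
#include <stdint.h>

struct icreate_rec {                 /* toy on-log record */
	uint32_t agno, agbno, count, length;
};

struct geom {                        /* toy filesystem geometry */
	uint32_t agcount, agblocks, chunk_inodes, chunk_blocks;
};

/* Reject a record unless every field is in range for this filesystem. */
static int validate_icreate(const struct icreate_rec *r, const struct geom *g)
{
	if (r->agno >= g->agcount)
		return EINVAL;                     /* bad AG number */
	if (!r->agbno || r->agbno >= g->agblocks)
		return EINVAL;                     /* bad AG block */
	if (r->count != g->chunk_inodes || r->length != g->chunk_blocks)
		return EINVAL;   /* allocation geometry is a fixed value */
	return 0;
}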
@@ -3023,6 +3133,7 @@ xlog_recover_commit_pass1(
3023 case XFS_LI_EFI: 3133 case XFS_LI_EFI:
3024 case XFS_LI_EFD: 3134 case XFS_LI_EFD:
3025 case XFS_LI_DQUOT: 3135 case XFS_LI_DQUOT:
3136 case XFS_LI_ICREATE:
3026 /* nothing to do in pass 1 */ 3137 /* nothing to do in pass 1 */
3027 return 0; 3138 return 0;
3028 default: 3139 default:
@@ -3053,6 +3164,8 @@ xlog_recover_commit_pass2(
3053 return xlog_recover_efd_pass2(log, item); 3164 return xlog_recover_efd_pass2(log, item);
3054 case XFS_LI_DQUOT: 3165 case XFS_LI_DQUOT:
3055 return xlog_recover_dquot_pass2(log, buffer_list, item); 3166 return xlog_recover_dquot_pass2(log, buffer_list, item);
3167 case XFS_LI_ICREATE:
3168 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3056 case XFS_LI_QUOTAOFF: 3169 case XFS_LI_QUOTAOFF:
3057 /* nothing to do in pass2 */ 3170 /* nothing to do in pass2 */
3058 return 0; 3171 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
336 return XFS_ERROR(EWRONGFS); 336 return XFS_ERROR(EWRONGFS);
337 } 337 }
338 338
339 if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
340 (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
341 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
342 xfs_notice(mp,
343"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
344 return XFS_ERROR(EFSCORRUPTED);
345 }
346
339 /* 347 /*
340 * Version 5 superblock feature mask validation. Reject combinations the 348 * Version 5 superblock feature mask validation. Reject combinations the
341 * kernel cannot support up front before checking anything else. For 349 * kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
561 return error; 569 return error;
562} 570}
563 571
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
564void 584void
565xfs_sb_from_disk( 585xfs_sb_from_disk(
566 struct xfs_sb *to, 586 struct xfs_sb *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
622 to->sb_lsn = be64_to_cpu(from->sb_lsn); 642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
623} 643}
624 644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
	655 	 * The in-core version of sb_qflags does not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
625/* 674/*
626 * Copy in core superblock to ondisk one. 675 * Copy in core superblock to ondisk one.
627 * 676 *
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
643 if (!fields) 692 if (!fields)
644 return; 693 return;
645 694
695 xfs_sb_quota_to_disk(to, from, &fields);
646 while (fields) { 696 while (fields) {
647 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
648 first = xfs_sb_info[f].offset; 698 first = xfs_sb_info[f].offset;
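xfs_sb_quota_from_disk()/xfs_sb_quota_to_disk() fold the split in-core group/project enforcement bits into the single on-disk OQUOTA pair and back, using PQUOTA_ACCT to disambiguate. A round-trip sketch with made-up bit values (the CHKD bits convert the same way and are omitted for brevity):

#include <assert.h>
#include <stdint.h>

#define PQUOTA_ACCT 0x01   /* illustrative values, not the on-disk ones */
#define OQUOTA_ENFD 0x02
#define PQUOTA_ENFD 0x04
#define GQUOTA_ENFD 0x08

static uint16_t qflags_from_disk(uint16_t f)
{
	if (f & OQUOTA_ENFD)   /* project if accounting says so, else group */
		f |= (f & PQUOTA_ACCT) ? PQUOTA_ENFD : GQUOTA_ENFD;
	return f & ~OQUOTA_ENFD;
}

static uint16_t qflags_to_disk(uint16_t f)
{
	if (f & (PQUOTA_ENFD | GQUOTA_ENFD))
		f |= OQUOTA_ENFD;
	return f & ~(PQUOTA_ENFD | GQUOTA_ENFD);
}

int main(void)
{
	uint16_t disk = PQUOTA_ACCT | OQUOTA_ENFD;
	uint16_t core = qflags_from_disk(disk);

	assert(core & PQUOTA_ENFD);            /* OQUOTA became PQUOTA */
	assert(qflags_to_disk(core) == disk);  /* and converts back    */
	return 0;
}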
@@ -835,6 +885,7 @@ reread:
835 */ 885 */
836 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
837 887
888 xfs_sb_quota_from_disk(&mp->m_sb);
838 /* 889 /*
839 * We must be able to do sector-sized and sector-aligned IO. 890 * We must be able to do sector-sized and sector-aligned IO.
840 */ 891 */
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
987 */ 1038 */
988 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 1039 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
989 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 1040 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
990 if (mp->m_flags & XFS_MOUNT_RETERR) { 1041 xfs_warn(mp,
991 xfs_warn(mp, "alignment check failed: " 1042 "alignment check failed: sunit/swidth vs. blocksize(%d)",
992 "(sunit/swidth vs. blocksize)"); 1043 sbp->sb_blocksize);
993 return XFS_ERROR(EINVAL); 1044 return XFS_ERROR(EINVAL);
994 }
995 mp->m_dalign = mp->m_swidth = 0;
996 } else { 1045 } else {
997 /* 1046 /*
998 * Convert the stripe unit and width to FSBs. 1047 * Convert the stripe unit and width to FSBs.
999 */ 1048 */
1000 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); 1049 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
1001 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { 1050 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
1002 if (mp->m_flags & XFS_MOUNT_RETERR) {
1003 xfs_warn(mp, "alignment check failed: "
1004 "(sunit/swidth vs. ag size)");
1005 return XFS_ERROR(EINVAL);
1006 }
1007 xfs_warn(mp, 1051 xfs_warn(mp,
1008 "stripe alignment turned off: sunit(%d)/swidth(%d) " 1052 "alignment check failed: sunit/swidth vs. agsize(%d)",
1009 "incompatible with agsize(%d)", 1053 sbp->sb_agblocks);
1010 mp->m_dalign, mp->m_swidth, 1054 return XFS_ERROR(EINVAL);
1011 sbp->sb_agblocks);
1012
1013 mp->m_dalign = 0;
1014 mp->m_swidth = 0;
1015 } else if (mp->m_dalign) { 1055 } else if (mp->m_dalign) {
1016 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 1056 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
1017 } else { 1057 } else {
1018 if (mp->m_flags & XFS_MOUNT_RETERR) { 1058 xfs_warn(mp,
1019 xfs_warn(mp, "alignment check failed: " 1059 "alignment check failed: sunit(%d) less than bsize(%d)",
1020 "sunit(%d) less than bsize(%d)", 1060 mp->m_dalign, sbp->sb_blocksize);
1021 mp->m_dalign, 1061 return XFS_ERROR(EINVAL);
1022 mp->m_blockmask +1);
1023 return XFS_ERROR(EINVAL);
1024 }
1025 mp->m_swidth = 0;
1026 } 1062 }
1027 } 1063 }
1028 1064
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
1039 sbp->sb_width = mp->m_swidth; 1075 sbp->sb_width = mp->m_swidth;
1040 mp->m_update_flags |= XFS_SB_WIDTH; 1076 mp->m_update_flags |= XFS_SB_WIDTH;
1041 } 1077 }
1078 } else {
1079 xfs_warn(mp,
1080 "cannot change alignment: superblock does not support data alignment");
1081 return XFS_ERROR(EINVAL);
1042 } 1082 }
1043 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 1083 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
1044 xfs_sb_version_hasdalign(&mp->m_sb)) { 1084 xfs_sb_version_hasdalign(&mp->m_sb)) {
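With XFS_MOUNT_RETERR gone, a stripe geometry that fails validation is now always a hard EINVAL at mount time instead of being quietly zeroed. A reduced model of the checks, with everything in plain byte units rather than the kernel's BBs and FSBs:

#include <errno.h>
#include <stdint.h>

/*
 * Stripe unit and width must be non-zero multiples of the block size,
 * and the stripe unit must divide the AG size; otherwise fail the mount.
 */
static int check_alignment(uint32_t sunit, uint32_t swidth,
			   uint32_t blocksize, uint32_t agblocks)
{
	if (!sunit)
		return 0;                        /* no alignment requested */
	if (sunit % blocksize || swidth % blocksize)
		return EINVAL;                   /* vs. blocksize */
	if (agblocks % (sunit / blocksize))
		return EINVAL;                   /* vs. AG size */
	return 0;
}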
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ 192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ 193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
194 uint m_chsize; /* size of next field */ 194 uint m_chsize; /* size of next field */
195 struct xfs_chash *m_chash; /* fs private inode per-cluster
196 * hash table */
197 atomic_t m_active_trans; /* number trans frozen */ 195 atomic_t m_active_trans; /* number trans frozen */
198#ifdef HAVE_PERCPU_SB 196#ifdef HAVE_PERCPU_SB
199 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 197 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
229 operations, typically for 227 operations, typically for
230 disk errors in metadata */ 228 disk errors in metadata */
231#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ 229#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
232#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
233 user */
234#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
235 allocations */ 231 allocations */
236#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 232#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..d320794d03ce 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
70 void *data) 70 void *data)
71{ 71{
72 struct xfs_quotainfo *qi = mp->m_quotainfo; 72 struct xfs_quotainfo *qi = mp->m_quotainfo;
73 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 73 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
74 uint32_t next_index; 74 uint32_t next_index;
75 int last_error = 0; 75 int last_error = 0;
76 int skipped; 76 int skipped;
@@ -137,6 +137,7 @@ xfs_qm_dqpurge(
137 struct xfs_mount *mp = dqp->q_mount; 137 struct xfs_mount *mp = dqp->q_mount;
138 struct xfs_quotainfo *qi = mp->m_quotainfo; 138 struct xfs_quotainfo *qi = mp->m_quotainfo;
139 struct xfs_dquot *gdqp = NULL; 139 struct xfs_dquot *gdqp = NULL;
140 struct xfs_dquot *pdqp = NULL;
140 141
141 xfs_dqlock(dqp); 142 xfs_dqlock(dqp);
142 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { 143 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -145,8 +146,7 @@ xfs_qm_dqpurge(
145 } 146 }
146 147
147 /* 148 /*
148 * If this quota has a group hint attached, prepare for releasing it 149 * If this quota has a hint attached, prepare for releasing it now.
149 * now.
150 */ 150 */
151 gdqp = dqp->q_gdquot; 151 gdqp = dqp->q_gdquot;
152 if (gdqp) { 152 if (gdqp) {
@@ -154,6 +154,12 @@ xfs_qm_dqpurge(
154 dqp->q_gdquot = NULL; 154 dqp->q_gdquot = NULL;
155 } 155 }
156 156
157 pdqp = dqp->q_pdquot;
158 if (pdqp) {
159 xfs_dqlock(pdqp);
160 dqp->q_pdquot = NULL;
161 }
162
157 dqp->dq_flags |= XFS_DQ_FREEING; 163 dqp->dq_flags |= XFS_DQ_FREEING;
158 164
159 xfs_dqflock(dqp); 165 xfs_dqflock(dqp);
@@ -189,7 +195,7 @@ xfs_qm_dqpurge(
189 xfs_dqfunlock(dqp); 195 xfs_dqfunlock(dqp);
190 xfs_dqunlock(dqp); 196 xfs_dqunlock(dqp);
191 197
192 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 198 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
193 be32_to_cpu(dqp->q_core.d_id)); 199 be32_to_cpu(dqp->q_core.d_id));
194 qi->qi_dquots--; 200 qi->qi_dquots--;
195 201
@@ -208,6 +214,8 @@ xfs_qm_dqpurge(
208 214
209 if (gdqp) 215 if (gdqp)
210 xfs_qm_dqput(gdqp); 216 xfs_qm_dqput(gdqp);
217 if (pdqp)
218 xfs_qm_dqput(pdqp);
211 return 0; 219 return 0;
212} 220}
213 221
@@ -299,8 +307,10 @@ xfs_qm_mount_quotas(
299 */ 307 */
300 if (!XFS_IS_UQUOTA_ON(mp)) 308 if (!XFS_IS_UQUOTA_ON(mp))
301 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 309 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
302 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) 310 if (!XFS_IS_GQUOTA_ON(mp))
303 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 311 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
312 if (!XFS_IS_PQUOTA_ON(mp))
313 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
304 314
305 write_changes: 315 write_changes:
306 /* 316 /*
@@ -362,6 +372,10 @@ xfs_qm_unmount_quotas(
362 IRELE(mp->m_quotainfo->qi_gquotaip); 372 IRELE(mp->m_quotainfo->qi_gquotaip);
363 mp->m_quotainfo->qi_gquotaip = NULL; 373 mp->m_quotainfo->qi_gquotaip = NULL;
364 } 374 }
375 if (mp->m_quotainfo->qi_pquotaip) {
376 IRELE(mp->m_quotainfo->qi_pquotaip);
377 mp->m_quotainfo->qi_pquotaip = NULL;
378 }
365 } 379 }
366} 380}
367 381
@@ -408,7 +422,10 @@ xfs_qm_dqattach_one(
408 * be reclaimed as long as we have a ref from inode and we 422 * be reclaimed as long as we have a ref from inode and we
409 * hold the ilock. 423 * hold the ilock.
410 */ 424 */
411 dqp = udqhint->q_gdquot; 425 if (type == XFS_DQ_GROUP)
426 dqp = udqhint->q_gdquot;
427 else
428 dqp = udqhint->q_pdquot;
412 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { 429 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
413 ASSERT(*IO_idqpp == NULL); 430 ASSERT(*IO_idqpp == NULL);
414 431
@@ -451,28 +468,42 @@ xfs_qm_dqattach_one(
451 468
452 469
453/* 470/*
454 * Given a udquot and gdquot, attach a ptr to the group dquot in the 471 * Given a udquot and group/project type, attach the group/project
455 * udquot as a hint for future lookups. 472 * dquot pointer to the udquot as a hint for future lookups.
456 */ 473 */
457STATIC void 474STATIC void
458xfs_qm_dqattach_grouphint( 475xfs_qm_dqattach_hint(
459 xfs_dquot_t *udq, 476 struct xfs_inode *ip,
460 xfs_dquot_t *gdq) 477 int type)
461{ 478{
462 xfs_dquot_t *tmp; 479 struct xfs_dquot **dqhintp;
480 struct xfs_dquot *dqp;
481 struct xfs_dquot *udq = ip->i_udquot;
482
483 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
463 484
464 xfs_dqlock(udq); 485 xfs_dqlock(udq);
465 486
466 tmp = udq->q_gdquot; 487 if (type == XFS_DQ_GROUP) {
467 if (tmp) { 488 dqp = ip->i_gdquot;
468 if (tmp == gdq) 489 dqhintp = &udq->q_gdquot;
490 } else {
491 dqp = ip->i_pdquot;
492 dqhintp = &udq->q_pdquot;
493 }
494
495 if (*dqhintp) {
496 struct xfs_dquot *tmp;
497
498 if (*dqhintp == dqp)
469 goto done; 499 goto done;
470 500
471 udq->q_gdquot = NULL; 501 tmp = *dqhintp;
502 *dqhintp = NULL;
472 xfs_qm_dqrele(tmp); 503 xfs_qm_dqrele(tmp);
473 } 504 }
474 505
475 udq->q_gdquot = xfs_qm_dqhold(gdq); 506 *dqhintp = xfs_qm_dqhold(dqp);
476done: 507done:
477 xfs_dqunlock(udq); 508 xfs_dqunlock(udq);
478} 509}
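xfs_qm_dqattach_hint() folds the old group-only helper and the new project case into one routine by selecting the hint slot with a pointer-to-pointer. A standalone sketch of that selection idiom (locking and reference counting elided; the real code holds the user dquot's lock and uses xfs_qm_dqhold/xfs_qm_dqrele on the swap):

struct dquot { int id; };

struct udquot_hints {
	struct dquot *group;     /* analogue of q_gdquot */
	struct dquot *project;   /* analogue of q_pdquot */
};

enum dqtype { DQ_GROUP, DQ_PROJ };

/* Pick the hint slot by type, drop a stale hint, install the new one. */
static void attach_hint(struct udquot_hints *u, enum dqtype type,
			struct dquot *dqp)
{
	struct dquot **slot =
		(type == DQ_GROUP) ? &u->group : &u->project;

	if (*slot == dqp)
		return;          /* hint already current */
	*slot = dqp;             /* real code releases the old reference */
}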
@@ -489,8 +520,7 @@ xfs_qm_need_dqattach(
489 return false; 520 return false;
490 if (!XFS_NOT_DQATTACHED(mp, ip)) 521 if (!XFS_NOT_DQATTACHED(mp, ip))
491 return false; 522 return false;
492 if (ip->i_ino == mp->m_sb.sb_uquotino || 523 if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
493 ip->i_ino == mp->m_sb.sb_gquotino)
494 return false; 524 return false;
495 return true; 525 return true;
496} 526}
@@ -526,12 +556,8 @@ xfs_qm_dqattach_locked(
526 } 556 }
527 557
528 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 558 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
529 if (XFS_IS_OQUOTA_ON(mp)) { 559 if (XFS_IS_GQUOTA_ON(mp)) {
530 error = XFS_IS_GQUOTA_ON(mp) ? 560 error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
531 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
532 flags & XFS_QMOPT_DQALLOC,
533 ip->i_udquot, &ip->i_gdquot) :
534 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
535 flags & XFS_QMOPT_DQALLOC, 561 flags & XFS_QMOPT_DQALLOC,
536 ip->i_udquot, &ip->i_gdquot); 562 ip->i_udquot, &ip->i_gdquot);
537 /* 563 /*
@@ -543,14 +569,28 @@ xfs_qm_dqattach_locked(
543 nquotas++; 569 nquotas++;
544 } 570 }
545 571
572 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
573 if (XFS_IS_PQUOTA_ON(mp)) {
574 error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
575 flags & XFS_QMOPT_DQALLOC,
576 ip->i_udquot, &ip->i_pdquot);
577 /*
578 * Don't worry about the udquot that we may have
579 * attached above. It'll get detached, if not already.
580 */
581 if (error)
582 goto done;
583 nquotas++;
584 }
585
546 /* 586 /*
547 * Attach this group quota to the user quota as a hint. 587 * Attach this group/project quota to the user quota as a hint.
548 	 * This WON'T, in general, result in thrashing.	588 	 * This WON'T, in general, result in thrashing.
549 */ 589 */
550 if (nquotas == 2) { 590 if (nquotas > 1 && ip->i_udquot) {
551 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 591 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
552 ASSERT(ip->i_udquot); 592 ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
553 ASSERT(ip->i_gdquot); 593 ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
554 594
555 /* 595 /*
556 * We do not have i_udquot locked at this point, but this check 596 * We do not have i_udquot locked at this point, but this check
@@ -559,7 +599,10 @@ xfs_qm_dqattach_locked(
559 * succeed in general. 599 * succeed in general.
560 */ 600 */
561 if (ip->i_udquot->q_gdquot != ip->i_gdquot) 601 if (ip->i_udquot->q_gdquot != ip->i_gdquot)
562 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 602 xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
603
604 if (ip->i_udquot->q_pdquot != ip->i_pdquot)
605 xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
563 } 606 }
564 607
565 done: 608 done:
@@ -567,8 +610,10 @@ xfs_qm_dqattach_locked(
567 if (!error) { 610 if (!error) {
568 if (XFS_IS_UQUOTA_ON(mp)) 611 if (XFS_IS_UQUOTA_ON(mp))
569 ASSERT(ip->i_udquot); 612 ASSERT(ip->i_udquot);
570 if (XFS_IS_OQUOTA_ON(mp)) 613 if (XFS_IS_GQUOTA_ON(mp))
571 ASSERT(ip->i_gdquot); 614 ASSERT(ip->i_gdquot);
615 if (XFS_IS_PQUOTA_ON(mp))
616 ASSERT(ip->i_pdquot);
572 } 617 }
573 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 618 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
574#endif 619#endif
@@ -601,13 +646,12 @@ void
601xfs_qm_dqdetach( 646xfs_qm_dqdetach(
602 xfs_inode_t *ip) 647 xfs_inode_t *ip)
603{ 648{
604 if (!(ip->i_udquot || ip->i_gdquot)) 649 if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
605 return; 650 return;
606 651
607 trace_xfs_dquot_dqdetach(ip); 652 trace_xfs_dquot_dqdetach(ip);
608 653
609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 654 ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
610 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
611 if (ip->i_udquot) { 655 if (ip->i_udquot) {
612 xfs_qm_dqrele(ip->i_udquot); 656 xfs_qm_dqrele(ip->i_udquot);
613 ip->i_udquot = NULL; 657 ip->i_udquot = NULL;
@@ -616,6 +660,10 @@ xfs_qm_dqdetach(
616 xfs_qm_dqrele(ip->i_gdquot); 660 xfs_qm_dqrele(ip->i_gdquot);
617 ip->i_gdquot = NULL; 661 ip->i_gdquot = NULL;
618 } 662 }
663 if (ip->i_pdquot) {
664 xfs_qm_dqrele(ip->i_pdquot);
665 ip->i_pdquot = NULL;
666 }
619} 667}
620 668
621int 669int
@@ -660,6 +708,7 @@ xfs_qm_init_quotainfo(
660 708
661 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 709 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
662 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 710 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
711 INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
663 mutex_init(&qinf->qi_tree_lock); 712 mutex_init(&qinf->qi_tree_lock);
664 713
665 INIT_LIST_HEAD(&qinf->qi_lru_list); 714 INIT_LIST_HEAD(&qinf->qi_lru_list);
@@ -761,6 +810,10 @@ xfs_qm_destroy_quotainfo(
761 IRELE(qi->qi_gquotaip); 810 IRELE(qi->qi_gquotaip);
762 qi->qi_gquotaip = NULL; 811 qi->qi_gquotaip = NULL;
763 } 812 }
813 if (qi->qi_pquotaip) {
814 IRELE(qi->qi_pquotaip);
815 qi->qi_pquotaip = NULL;
816 }
764 mutex_destroy(&qi->qi_quotaofflock); 817 mutex_destroy(&qi->qi_quotaofflock);
765 kmem_free(qi); 818 kmem_free(qi);
766 mp->m_quotainfo = NULL; 819 mp->m_quotainfo = NULL;
@@ -1152,7 +1205,7 @@ xfs_qm_dqusage_adjust(
1152 * rootino must have its resources accounted for, not so with the quota 1205 * rootino must have its resources accounted for, not so with the quota
1153 * inodes. 1206 * inodes.
1154 */ 1207 */
1155 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1208 if (xfs_is_quota_inode(&mp->m_sb, ino)) {
1156 *res = BULKSTAT_RV_NOTHING; 1209 *res = BULKSTAT_RV_NOTHING;
1157 return XFS_ERROR(EINVAL); 1210 return XFS_ERROR(EINVAL);
1158 } 1211 }
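The repeated two-inode comparisons throughout this commit collapse into xfs_is_quota_inode(). Its body is not shown in this diff; a plausible minimal definition consistent with the call sites (and with project quota still sharing sb_gquotino at this point) would be:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t xfs_ino_t;

struct sb_min {                      /* only the fields the check needs */
	xfs_ino_t sb_uquotino;
	xfs_ino_t sb_gquotino;
};

/* Likely shape of the helper: does this inode back any quota file? */
static inline bool is_quota_inode(const struct sb_min *sbp, xfs_ino_t ino)
{
	return ino == sbp->sb_uquotino || ino == sbp->sb_gquotino;
}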
@@ -1262,19 +1315,21 @@ int
1262xfs_qm_quotacheck( 1315xfs_qm_quotacheck(
1263 xfs_mount_t *mp) 1316 xfs_mount_t *mp)
1264{ 1317{
1265 int done, count, error, error2; 1318 int done, count, error, error2;
1266 xfs_ino_t lastino; 1319 xfs_ino_t lastino;
1267 size_t structsz; 1320 size_t structsz;
1268 xfs_inode_t *uip, *gip; 1321 uint flags;
1269 uint flags; 1322 LIST_HEAD (buffer_list);
1270 LIST_HEAD (buffer_list); 1323 struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
1324 struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
1325 struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
1271 1326
1272 count = INT_MAX; 1327 count = INT_MAX;
1273 structsz = 1; 1328 structsz = 1;
1274 lastino = 0; 1329 lastino = 0;
1275 flags = 0; 1330 flags = 0;
1276 1331
1277 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1332 ASSERT(uip || gip || pip);
1278 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1333 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1279 1334
1280 xfs_notice(mp, "Quotacheck needed: Please wait."); 1335 xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1339,6 @@ xfs_qm_quotacheck(
1284 * their counters to zero. We need a clean slate. 1339 * their counters to zero. We need a clean slate.
1285 * We don't log our changes till later. 1340 * We don't log our changes till later.
1286 */ 1341 */
1287 uip = mp->m_quotainfo->qi_uquotaip;
1288 if (uip) { 1342 if (uip) {
1289 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, 1343 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1290 &buffer_list); 1344 &buffer_list);
@@ -1293,14 +1347,20 @@ xfs_qm_quotacheck(
1293 flags |= XFS_UQUOTA_CHKD; 1347 flags |= XFS_UQUOTA_CHKD;
1294 } 1348 }
1295 1349
1296 gip = mp->m_quotainfo->qi_gquotaip;
1297 if (gip) { 1350 if (gip) {
1298 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1351 error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
1299 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1300 &buffer_list); 1352 &buffer_list);
1301 if (error) 1353 if (error)
1302 goto error_return; 1354 goto error_return;
1303 flags |= XFS_OQUOTA_CHKD; 1355 flags |= XFS_GQUOTA_CHKD;
1356 }
1357
1358 if (pip) {
1359 error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
1360 &buffer_list);
1361 if (error)
1362 goto error_return;
1363 flags |= XFS_PQUOTA_CHKD;
1304 } 1364 }
1305 1365
1306 do { 1366 do {
@@ -1395,15 +1455,14 @@ STATIC int
1395xfs_qm_init_quotainos( 1455xfs_qm_init_quotainos(
1396 xfs_mount_t *mp) 1456 xfs_mount_t *mp)
1397{ 1457{
1398 xfs_inode_t *uip, *gip; 1458 struct xfs_inode *uip = NULL;
1399 int error; 1459 struct xfs_inode *gip = NULL;
1400 __int64_t sbflags; 1460 struct xfs_inode *pip = NULL;
1401 uint flags; 1461 int error;
1462 __int64_t sbflags = 0;
1463 uint flags = 0;
1402 1464
1403 ASSERT(mp->m_quotainfo); 1465 ASSERT(mp->m_quotainfo);
1404 uip = gip = NULL;
1405 sbflags = 0;
1406 flags = 0;
1407 1466
1408 /* 1467 /*
1409 * Get the uquota and gquota inodes 1468 * Get the uquota and gquota inodes
@@ -1412,19 +1471,27 @@ xfs_qm_init_quotainos(
1412 if (XFS_IS_UQUOTA_ON(mp) && 1471 if (XFS_IS_UQUOTA_ON(mp) &&
1413 mp->m_sb.sb_uquotino != NULLFSINO) { 1472 mp->m_sb.sb_uquotino != NULLFSINO) {
1414 ASSERT(mp->m_sb.sb_uquotino > 0); 1473 ASSERT(mp->m_sb.sb_uquotino > 0);
1415 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1474 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1416 0, 0, &uip))) 1475 0, 0, &uip);
1476 if (error)
1417 return XFS_ERROR(error); 1477 return XFS_ERROR(error);
1418 } 1478 }
1419 if (XFS_IS_OQUOTA_ON(mp) && 1479 if (XFS_IS_GQUOTA_ON(mp) &&
1420 mp->m_sb.sb_gquotino != NULLFSINO) { 1480 mp->m_sb.sb_gquotino != NULLFSINO) {
1421 ASSERT(mp->m_sb.sb_gquotino > 0); 1481 ASSERT(mp->m_sb.sb_gquotino > 0);
1422 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1482 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1423 0, 0, &gip))) { 1483 0, 0, &gip);
1424 if (uip) 1484 if (error)
1425 IRELE(uip); 1485 goto error_rele;
1426 return XFS_ERROR(error); 1486 }
1427 } 1487 /* XXX: Use gquotino for now */
1488 if (XFS_IS_PQUOTA_ON(mp) &&
1489 mp->m_sb.sb_gquotino != NULLFSINO) {
1490 ASSERT(mp->m_sb.sb_gquotino > 0);
1491 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1492 0, 0, &pip);
1493 if (error)
1494 goto error_rele;
1428 } 1495 }
1429 } else { 1496 } else {
1430 flags |= XFS_QMOPT_SBVERSION; 1497 flags |= XFS_QMOPT_SBVERSION;
@@ -1433,36 +1500,52 @@ xfs_qm_init_quotainos(
1433 } 1500 }
1434 1501
1435 /* 1502 /*
1436 * Create the two inodes, if they don't exist already. The changes 1503 * Create the three inodes, if they don't exist already. The changes
1437 * made above will get added to a transaction and logged in one of 1504 * made above will get added to a transaction and logged in one of
1438 * the qino_alloc calls below. If the device is readonly, 1505 * the qino_alloc calls below. If the device is readonly,
1439 * temporarily switch to read-write to do this. 1506 * temporarily switch to read-write to do this.
1440 */ 1507 */
1441 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1508 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1442 if ((error = xfs_qm_qino_alloc(mp, &uip, 1509 error = xfs_qm_qino_alloc(mp, &uip,
1443 sbflags | XFS_SB_UQUOTINO, 1510 sbflags | XFS_SB_UQUOTINO,
1444 flags | XFS_QMOPT_UQUOTA))) 1511 flags | XFS_QMOPT_UQUOTA);
1445 return XFS_ERROR(error); 1512 if (error)
1513 goto error_rele;
1446 1514
1447 flags &= ~XFS_QMOPT_SBVERSION; 1515 flags &= ~XFS_QMOPT_SBVERSION;
1448 } 1516 }
1449 if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { 1517 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
1450 flags |= (XFS_IS_GQUOTA_ON(mp) ?
1451 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1452 error = xfs_qm_qino_alloc(mp, &gip, 1518 error = xfs_qm_qino_alloc(mp, &gip,
1453 sbflags | XFS_SB_GQUOTINO, flags); 1519 sbflags | XFS_SB_GQUOTINO,
1454 if (error) { 1520 flags | XFS_QMOPT_GQUOTA);
1455 if (uip) 1521 if (error)
1456 IRELE(uip); 1522 goto error_rele;
1457 1523
1458 return XFS_ERROR(error); 1524 flags &= ~XFS_QMOPT_SBVERSION;
1459 } 1525 }
1526 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1527 /* XXX: Use XFS_SB_GQUOTINO for now */
1528 error = xfs_qm_qino_alloc(mp, &pip,
1529 sbflags | XFS_SB_GQUOTINO,
1530 flags | XFS_QMOPT_PQUOTA);
1531 if (error)
1532 goto error_rele;
1460 } 1533 }
1461 1534
1462 mp->m_quotainfo->qi_uquotaip = uip; 1535 mp->m_quotainfo->qi_uquotaip = uip;
1463 mp->m_quotainfo->qi_gquotaip = gip; 1536 mp->m_quotainfo->qi_gquotaip = gip;
1537 mp->m_quotainfo->qi_pquotaip = pip;
1464 1538
1465 return 0; 1539 return 0;
1540
1541error_rele:
1542 if (uip)
1543 IRELE(uip);
1544 if (gip)
1545 IRELE(gip);
1546 if (pip)
1547 IRELE(pip);
1548 return XFS_ERROR(error);
1466} 1549}
1467 1550
1468STATIC void 1551STATIC void
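The rewritten xfs_qm_init_quotainos() funnels every failure through a single error_rele label that releases whichever quota inodes were already grabbed, the standard goto-unwind idiom. A self-contained sketch of the shape (malloc/free standing in for xfs_iget/IRELE):

#include <errno.h>
#include <stdlib.h>

struct inode { int ino; };

static int iget(int ino, struct inode **out)
{
	*out = malloc(sizeof(**out));
	return *out ? 0 : ENOMEM;
}

static int init_quotainos(struct inode **u, struct inode **g, struct inode **p)
{
	struct inode *uip = NULL, *gip = NULL, *pip = NULL;
	int error;

	if ((error = iget(1, &uip)) || (error = iget(2, &gip)) ||
	    (error = iget(3, &pip)))
		goto error_rele;

	*u = uip; *g = gip; *p = pip;
	return 0;

error_rele:                     /* free whichever lookups succeeded */
	free(uip);
	free(gip);
	free(pip);
	return error;
}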
@@ -1473,7 +1556,7 @@ xfs_qm_dqfree_one(
1473 struct xfs_quotainfo *qi = mp->m_quotainfo; 1556 struct xfs_quotainfo *qi = mp->m_quotainfo;
1474 1557
1475 mutex_lock(&qi->qi_tree_lock); 1558 mutex_lock(&qi->qi_tree_lock);
1476 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 1559 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
1477 be32_to_cpu(dqp->q_core.d_id)); 1560 be32_to_cpu(dqp->q_core.d_id));
1478 1561
1479 qi->qi_dquots--; 1562 qi->qi_dquots--;
@@ -1656,10 +1739,13 @@ xfs_qm_vop_dqalloc(
1656 prid_t prid, 1739 prid_t prid,
1657 uint flags, 1740 uint flags,
1658 struct xfs_dquot **O_udqpp, 1741 struct xfs_dquot **O_udqpp,
1659 struct xfs_dquot **O_gdqpp) 1742 struct xfs_dquot **O_gdqpp,
1743 struct xfs_dquot **O_pdqpp)
1660{ 1744{
1661 struct xfs_mount *mp = ip->i_mount; 1745 struct xfs_mount *mp = ip->i_mount;
1662 struct xfs_dquot *uq, *gq; 1746 struct xfs_dquot *uq = NULL;
1747 struct xfs_dquot *gq = NULL;
1748 struct xfs_dquot *pq = NULL;
1663 int error; 1749 int error;
1664 uint lockflags; 1750 uint lockflags;
1665 1751
@@ -1684,7 +1770,6 @@ xfs_qm_vop_dqalloc(
1684 } 1770 }
1685 } 1771 }
1686 1772
1687 uq = gq = NULL;
1688 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { 1773 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
1689 if (ip->i_d.di_uid != uid) { 1774 if (ip->i_d.di_uid != uid) {
1690 /* 1775 /*
@@ -1697,11 +1782,12 @@ xfs_qm_vop_dqalloc(
1697 * holding ilock. 1782 * holding ilock.
1698 */ 1783 */
1699 xfs_iunlock(ip, lockflags); 1784 xfs_iunlock(ip, lockflags);
1700 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1785 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
1701 XFS_DQ_USER, 1786 XFS_DQ_USER,
1702 XFS_QMOPT_DQALLOC | 1787 XFS_QMOPT_DQALLOC |
1703 XFS_QMOPT_DOWARN, 1788 XFS_QMOPT_DOWARN,
1704 &uq))) { 1789 &uq);
1790 if (error) {
1705 ASSERT(error != ENOENT); 1791 ASSERT(error != ENOENT);
1706 return error; 1792 return error;
1707 } 1793 }
@@ -1723,15 +1809,14 @@ xfs_qm_vop_dqalloc(
1723 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1809 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1724 if (ip->i_d.di_gid != gid) { 1810 if (ip->i_d.di_gid != gid) {
1725 xfs_iunlock(ip, lockflags); 1811 xfs_iunlock(ip, lockflags);
1726 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1812 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
1727 XFS_DQ_GROUP, 1813 XFS_DQ_GROUP,
1728 XFS_QMOPT_DQALLOC | 1814 XFS_QMOPT_DQALLOC |
1729 XFS_QMOPT_DOWARN, 1815 XFS_QMOPT_DOWARN,
1730 &gq))) { 1816 &gq);
1731 if (uq) 1817 if (error) {
1732 xfs_qm_dqrele(uq);
1733 ASSERT(error != ENOENT); 1818 ASSERT(error != ENOENT);
1734 return error; 1819 goto error_rele;
1735 } 1820 }
1736 xfs_dqunlock(gq); 1821 xfs_dqunlock(gq);
1737 lockflags = XFS_ILOCK_SHARED; 1822 lockflags = XFS_ILOCK_SHARED;
@@ -1740,25 +1825,25 @@ xfs_qm_vop_dqalloc(
1740 ASSERT(ip->i_gdquot); 1825 ASSERT(ip->i_gdquot);
1741 gq = xfs_qm_dqhold(ip->i_gdquot); 1826 gq = xfs_qm_dqhold(ip->i_gdquot);
1742 } 1827 }
1743 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 1828 }
1829 if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
1744 if (xfs_get_projid(ip) != prid) { 1830 if (xfs_get_projid(ip) != prid) {
1745 xfs_iunlock(ip, lockflags); 1831 xfs_iunlock(ip, lockflags);
1746 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 1832 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
1747 XFS_DQ_PROJ, 1833 XFS_DQ_PROJ,
1748 XFS_QMOPT_DQALLOC | 1834 XFS_QMOPT_DQALLOC |
1749 XFS_QMOPT_DOWARN, 1835 XFS_QMOPT_DOWARN,
1750 &gq))) { 1836 &pq);
1751 if (uq) 1837 if (error) {
1752 xfs_qm_dqrele(uq);
1753 ASSERT(error != ENOENT); 1838 ASSERT(error != ENOENT);
1754 return (error); 1839 goto error_rele;
1755 } 1840 }
1756 xfs_dqunlock(gq); 1841 xfs_dqunlock(pq);
1757 lockflags = XFS_ILOCK_SHARED; 1842 lockflags = XFS_ILOCK_SHARED;
1758 xfs_ilock(ip, lockflags); 1843 xfs_ilock(ip, lockflags);
1759 } else { 1844 } else {
1760 ASSERT(ip->i_gdquot); 1845 ASSERT(ip->i_pdquot);
1761 gq = xfs_qm_dqhold(ip->i_gdquot); 1846 pq = xfs_qm_dqhold(ip->i_pdquot);
1762 } 1847 }
1763 } 1848 }
1764 if (uq) 1849 if (uq)
@@ -1773,7 +1858,18 @@ xfs_qm_vop_dqalloc(
1773 *O_gdqpp = gq; 1858 *O_gdqpp = gq;
1774 else if (gq) 1859 else if (gq)
1775 xfs_qm_dqrele(gq); 1860 xfs_qm_dqrele(gq);
1861 if (O_pdqpp)
1862 *O_pdqpp = pq;
1863 else if (pq)
1864 xfs_qm_dqrele(pq);
1776 return 0; 1865 return 0;
1866
1867error_rele:
1868 if (gq)
1869 xfs_qm_dqrele(gq);
1870 if (uq)
1871 xfs_qm_dqrele(uq);
1872 return error;
1777} 1873}
1778 1874
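Note the structural change in xfs_qm_vop_dqalloc(): the project branch used to be an else-if hanging off the group branch, so group and project dquots were mutually exclusive, whereas now they are independent if blocks and all three dquots can come back from one call. A small sketch of the behavioral difference follows; the flag values and function names are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

#define QMOPT_UQUOTA	0x1	/* illustrative flags, not the XFS values */
#define QMOPT_GQUOTA	0x2
#define QMOPT_PQUOTA	0x4

/*
 * Before: group and project were mutually exclusive (else-if), so a
 * caller asking for both only ever got the group dquot back.
 */
static void dqalloc_old(unsigned flags, bool *gq, bool *pq)
{
	*gq = *pq = false;
	if (flags & QMOPT_GQUOTA)
		*gq = true;
	else if (flags & QMOPT_PQUOTA)
		*pq = true;
}

/*
 * After: independent checks, so user, group and project dquots can
 * all be allocated in a single call.
 */
static void dqalloc_new(unsigned flags, bool *gq, bool *pq)
{
	*gq = *pq = false;
	if (flags & QMOPT_GQUOTA)
		*gq = true;
	if (flags & QMOPT_PQUOTA)
		*pq = true;
}

int main(void)
{
	bool gq, pq;

	dqalloc_old(QMOPT_GQUOTA | QMOPT_PQUOTA, &gq, &pq);
	printf("old: gq=%d pq=%d\n", gq, pq);	/* old: gq=1 pq=0 */
	dqalloc_new(QMOPT_GQUOTA | QMOPT_PQUOTA, &gq, &pq);
	printf("new: gq=%d pq=%d\n", gq, pq);	/* new: gq=1 pq=1 */
	return 0;
}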
1779/* 1875/*
@@ -1821,29 +1917,34 @@ xfs_qm_vop_chown(
1821 */ 1917 */
1822int 1918int
1823xfs_qm_vop_chown_reserve( 1919xfs_qm_vop_chown_reserve(
1824 xfs_trans_t *tp, 1920 struct xfs_trans *tp,
1825 xfs_inode_t *ip, 1921 struct xfs_inode *ip,
1826 xfs_dquot_t *udqp, 1922 struct xfs_dquot *udqp,
1827 xfs_dquot_t *gdqp, 1923 struct xfs_dquot *gdqp,
1828 uint flags) 1924 struct xfs_dquot *pdqp,
1925 uint flags)
1829{ 1926{
1830 xfs_mount_t *mp = ip->i_mount; 1927 struct xfs_mount *mp = ip->i_mount;
1831 uint delblks, blkflags, prjflags = 0; 1928 uint delblks, blkflags, prjflags = 0;
1832 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 1929 struct xfs_dquot *udq_unres = NULL;
1833 int error; 1930 struct xfs_dquot *gdq_unres = NULL;
1931 struct xfs_dquot *pdq_unres = NULL;
1932 struct xfs_dquot *udq_delblks = NULL;
1933 struct xfs_dquot *gdq_delblks = NULL;
1934 struct xfs_dquot *pdq_delblks = NULL;
1935 int error;
1834 1936
1835 1937
1836 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 1938 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1837 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1939 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1838 1940
1839 delblks = ip->i_delayed_blks; 1941 delblks = ip->i_delayed_blks;
1840 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
1841 blkflags = XFS_IS_REALTIME_INODE(ip) ? 1942 blkflags = XFS_IS_REALTIME_INODE(ip) ?
1842 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1943 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1843 1944
1844 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1945 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1845 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1946 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
1846 delblksudq = udqp; 1947 udq_delblks = udqp;
1847 /* 1948 /*
1848 * If there are delayed allocation blocks, then we have to 1949 * If there are delayed allocation blocks, then we have to
1849 * unreserve those from the old dquot, and add them to the 1950 * unreserve those from the old dquot, and add them to the
@@ -1851,29 +1952,34 @@ xfs_qm_vop_chown_reserve(
1851 */ 1952 */
1852 if (delblks) { 1953 if (delblks) {
1853 ASSERT(ip->i_udquot); 1954 ASSERT(ip->i_udquot);
1854 unresudq = ip->i_udquot; 1955 udq_unres = ip->i_udquot;
1855 } 1956 }
1856 } 1957 }
1857 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 1958 if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
1858 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 1959 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
1859 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) 1960 gdq_delblks = gdqp;
1860 prjflags = XFS_QMOPT_ENOSPC; 1961 if (delblks) {
1861 1962 ASSERT(ip->i_gdquot);
1862 if (prjflags || 1963 gdq_unres = ip->i_gdquot;
1863 (XFS_IS_GQUOTA_ON(ip->i_mount) &&
1864 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
1865 delblksgdq = gdqp;
1866 if (delblks) {
1867 ASSERT(ip->i_gdquot);
1868 unresgdq = ip->i_gdquot;
1869 }
1870 } 1964 }
1871 } 1965 }
1872 1966
1873 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, 1967 if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
1874 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, 1968 xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
1875 flags | blkflags | prjflags))) 1969 prjflags = XFS_QMOPT_ENOSPC;
1876 return (error); 1970 pdq_delblks = pdqp;
1971 if (delblks) {
1972 ASSERT(ip->i_pdquot);
1973 pdq_unres = ip->i_pdquot;
1974 }
1975 }
1976
1977 error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
1978 udq_delblks, gdq_delblks, pdq_delblks,
1979 ip->i_d.di_nblocks, 1,
1980 flags | blkflags | prjflags);
1981 if (error)
1982 return error;
1877 1983
1878 /* 1984 /*
1879 * Do the delayed blks reservations/unreservations now. Since these 1985
@@ -1885,15 +1991,17 @@ xfs_qm_vop_chown_reserve(
1885 /* 1991 /*
1886 * Do the reservations first. Unreservation can't fail. 1992 * Do the reservations first. Unreservation can't fail.
1887 */ 1993 */
1888 ASSERT(delblksudq || delblksgdq); 1994 ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
1889 ASSERT(unresudq || unresgdq); 1995 ASSERT(udq_unres || gdq_unres || pdq_unres);
1890 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1996 error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1891 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, 1997 udq_delblks, gdq_delblks, pdq_delblks,
1892 flags | blkflags | prjflags))) 1998 (xfs_qcnt_t)delblks, 0,
1893 return (error); 1999 flags | blkflags | prjflags);
2000 if (error)
2001 return error;
1894 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 2002 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1895 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, 2003 udq_unres, gdq_unres, pdq_unres,
1896 blkflags); 2004 -((xfs_qcnt_t)delblks), 0, blkflags);
1897 } 2005 }
1898 2006
1899 return (0); 2007 return (0);
@@ -1932,7 +2040,8 @@ xfs_qm_vop_create_dqattach(
1932 struct xfs_trans *tp, 2040 struct xfs_trans *tp,
1933 struct xfs_inode *ip, 2041 struct xfs_inode *ip,
1934 struct xfs_dquot *udqp, 2042 struct xfs_dquot *udqp,
1935 struct xfs_dquot *gdqp) 2043 struct xfs_dquot *gdqp,
2044 struct xfs_dquot *pdqp)
1936{ 2045{
1937 struct xfs_mount *mp = tp->t_mountp; 2046 struct xfs_mount *mp = tp->t_mountp;
1938 2047
@@ -1952,13 +2061,18 @@ xfs_qm_vop_create_dqattach(
1952 } 2061 }
1953 if (gdqp) { 2062 if (gdqp) {
1954 ASSERT(ip->i_gdquot == NULL); 2063 ASSERT(ip->i_gdquot == NULL);
1955 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2064 ASSERT(XFS_IS_GQUOTA_ON(mp));
1956 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2065 ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
1957 ip->i_d.di_gid : xfs_get_projid(ip)) ==
1958 be32_to_cpu(gdqp->q_core.d_id));
1959
1960 ip->i_gdquot = xfs_qm_dqhold(gdqp); 2066 ip->i_gdquot = xfs_qm_dqhold(gdqp);
1961 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2067 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
1962 } 2068 }
2069 if (pdqp) {
2070 ASSERT(ip->i_pdquot == NULL);
2071 ASSERT(XFS_IS_PQUOTA_ON(mp));
2072 ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
2073
2074 ip->i_pdquot = xfs_qm_dqhold(pdqp);
2075 xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
2076 }
1963} 2077}
1964 2078
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..579d6a02a5b6 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -44,9 +44,11 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
44typedef struct xfs_quotainfo { 44typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree; 45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree; 46 struct radix_tree_root qi_gquota_tree;
47 struct radix_tree_root qi_pquota_tree;
47 struct mutex qi_tree_lock; 48 struct mutex qi_tree_lock;
48 xfs_inode_t *qi_uquotaip; /* user quota inode */ 49 struct xfs_inode *qi_uquotaip; /* user quota inode */
49 xfs_inode_t *qi_gquotaip; /* group quota inode */ 50 struct xfs_inode *qi_gquotaip; /* group quota inode */
51 struct xfs_inode *qi_pquotaip; /* project quota inode */
50 struct list_head qi_lru_list; 52 struct list_head qi_lru_list;
51 struct mutex qi_lru_lock; 53 struct mutex qi_lru_lock;
52 int qi_lru_count; 54 int qi_lru_count;
@@ -69,30 +71,66 @@ typedef struct xfs_quotainfo {
69 struct shrinker qi_shrinker; 71 struct shrinker qi_shrinker;
70} xfs_quotainfo_t; 72} xfs_quotainfo_t;
71 73
72#define XFS_DQUOT_TREE(qi, type) \ 74static inline struct radix_tree_root *
73 ((type & XFS_DQ_USER) ? \ 75xfs_dquot_tree(
74 &((qi)->qi_uquota_tree) : \ 76 struct xfs_quotainfo *qi,
75 &((qi)->qi_gquota_tree)) 77 int type)
78{
79 switch (type) {
80 case XFS_DQ_USER:
81 return &qi->qi_uquota_tree;
82 case XFS_DQ_GROUP:
83 return &qi->qi_gquota_tree;
84 case XFS_DQ_PROJ:
85 return &qi->qi_pquota_tree;
86 default:
87 ASSERT(0);
88 }
89 return NULL;
90}
76 91
92static inline struct xfs_inode *
93xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
94{
95 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
96 case XFS_DQ_USER:
97 return dqp->q_mount->m_quotainfo->qi_uquotaip;
98 case XFS_DQ_GROUP:
99 return dqp->q_mount->m_quotainfo->qi_gquotaip;
100 case XFS_DQ_PROJ:
101 return dqp->q_mount->m_quotainfo->qi_pquotaip;
102 default:
103 ASSERT(0);
104 }
105 return NULL;
106}
77 107
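The old XFS_DQUOT_TREE() macro tested only the XFS_DQ_USER bit, so a project dquot would silently land in the group tree; the switch-based helpers above give each type its own radix tree and assert on anything else. A tiny sketch of that difference, with flag values assumed to mirror XFS_DQ_USER/PROJ/GROUP:

#include <assert.h>
#include <stdio.h>

#define DQ_USER		0x1	/* assumed values, for illustration */
#define DQ_PROJ		0x2
#define DQ_GROUP	0x4

static const char *tree_old(int type)
{
	/* Old macro logic: anything without DQ_USER set fell into the
	 * group tree, including project dquots. */
	return (type & DQ_USER) ? "uquota_tree" : "gquota_tree";
}

static const char *tree_new(int type)
{
	switch (type) {
	case DQ_USER:	return "uquota_tree";
	case DQ_GROUP:	return "gquota_tree";
	case DQ_PROJ:	return "pquota_tree";
	default:
		assert(0);	/* unknown dquot type */
		return NULL;
	}
}

int main(void)
{
	printf("old: PROJ -> %s\n", tree_old(DQ_PROJ));	/* gquota_tree */
	printf("new: PROJ -> %s\n", tree_new(DQ_PROJ));	/* pquota_tree */
	return 0;
}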
78extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, 108extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
79 unsigned int nbblks); 109 unsigned int nbblks);
80extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 110extern void xfs_trans_mod_dquot(struct xfs_trans *,
81extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 111 struct xfs_dquot *, uint, long);
82 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 112extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
83extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); 113 struct xfs_mount *, struct xfs_dquot *,
84extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); 114 struct xfs_dquot *, struct xfs_dquot *,
115 long, long, uint);
116extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
117extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
85 118
86/* 119/*
87 * We keep the usr and grp dquots separately so that locking will be easier 120 * We keep the usr, grp, and prj dquots separately so that locking will be
88 * to do at commit time. All transactions that we know of at this point 121 * easier to do at commit time. All transactions that we know of at this point
89 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. 122 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
90 */ 123 */
124enum {
125 XFS_QM_TRANS_USR = 0,
126 XFS_QM_TRANS_GRP,
127 XFS_QM_TRANS_PRJ,
128 XFS_QM_TRANS_DQTYPES
129};
91#define XFS_QM_TRANS_MAXDQS 2 130#define XFS_QM_TRANS_MAXDQS 2
92typedef struct xfs_dquot_acct { 131struct xfs_dquot_acct {
93 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; 132 struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
94 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; 133};
95} xfs_dquot_acct_t;
96 134
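Replacing the two named arrays with a single array indexed by the XFS_QM_TRANS_* enum means transaction dquot code can loop over quota types instead of special-casing user versus group. A minimal sketch, where struct dqtrx is only a placeholder for struct xfs_dqtrx:

#include <stdio.h>

enum {
	QM_TRANS_USR = 0,
	QM_TRANS_GRP,
	QM_TRANS_PRJ,
	QM_TRANS_DQTYPES
};
#define QM_TRANS_MAXDQS	2

struct dqtrx { long delta; };	/* placeholder for struct xfs_dqtrx */

struct dquot_acct {
	struct dqtrx dqs[QM_TRANS_DQTYPES][QM_TRANS_MAXDQS];
};

int main(void)
{
	struct dquot_acct acct = {0};
	int type, i;

	/* With one 2-D array, code that used to special-case
	 * dqa_usrdquots vs dqa_grpdquots becomes a plain loop. */
	for (type = 0; type < QM_TRANS_DQTYPES; type++)
		for (i = 0; i < QM_TRANS_MAXDQS; i++)
			acct.dqs[type][i].delta = 0;

	acct.dqs[QM_TRANS_PRJ][0].delta = 42;
	printf("prj slot 0 delta = %ld\n", acct.dqs[QM_TRANS_PRJ][0].delta);
	return 0;
}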
97/* 135/*
98 * Users are allowed to have a usage exceeding their softlimit for 136 * Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +144,23 @@ typedef struct xfs_dquot_acct {
106#define XFS_QM_IWARNLIMIT 5 144#define XFS_QM_IWARNLIMIT 5
107#define XFS_QM_RTBWARNLIMIT 5 145#define XFS_QM_RTBWARNLIMIT 5
108 146
109extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 147extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
110extern int xfs_qm_quotacheck(xfs_mount_t *); 148extern int xfs_qm_quotacheck(struct xfs_mount *);
111extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 149extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
112 150
113/* dquot stuff */ 151/* dquot stuff */
114extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); 152extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
115extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 153extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
116 154
117/* quota ops */ 155/* quota ops */
118extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); 156extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
119extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, 157extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
120 fs_disk_quota_t *); 158 uint, struct fs_disk_quota *);
121extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 159extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
122 fs_disk_quota_t *); 160 struct fs_disk_quota *);
123extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); 161extern int xfs_qm_scall_getqstat(struct xfs_mount *,
124extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 162 struct fs_quota_stat *);
125extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
126 165
127#endif /* __XFS_QM_H__ */ 166#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2d02eac1c9a8..437a52d91f6d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -112,16 +112,16 @@ xfs_qm_newmount(
112 112
113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || 113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || 114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
115 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
116 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
117 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 115 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
118 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 116 (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
117 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
118 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
119 xfs_dev_is_read_only(mp, "changing quota state")) { 119 xfs_dev_is_read_only(mp, "changing quota state")) {
120 xfs_warn(mp, "please mount with%s%s%s%s.", 120 xfs_warn(mp, "please mount with%s%s%s%s.",
121 (!quotaondisk ? "out quota" : ""), 121 (!quotaondisk ? "out quota" : ""),
122 (uquotaondisk ? " usrquota" : ""), 122 (uquotaondisk ? " usrquota" : ""),
123 (pquotaondisk ? " prjquota" : ""), 123 (gquotaondisk ? " grpquota" : ""),
124 (gquotaondisk ? " grpquota" : "")); 124 (pquotaondisk ? " prjquota" : ""));
125 return XFS_ERROR(EPERM); 125 return XFS_ERROR(EPERM);
126 } 126 }
127 127
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..e4f8b2d6f38b 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,12 @@ xfs_qm_scall_quotaoff(
117 } 117 }
118 if (flags & XFS_GQUOTA_ACCT) { 118 if (flags & XFS_GQUOTA_ACCT) {
119 dqtype |= XFS_QMOPT_GQUOTA; 119 dqtype |= XFS_QMOPT_GQUOTA;
120 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 120 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
121 inactivate_flags |= XFS_GQUOTA_ACTIVE; 121 inactivate_flags |= XFS_GQUOTA_ACTIVE;
122 } else if (flags & XFS_PQUOTA_ACCT) { 122 }
123 if (flags & XFS_PQUOTA_ACCT) {
123 dqtype |= XFS_QMOPT_PQUOTA; 124 dqtype |= XFS_QMOPT_PQUOTA;
124 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 125 flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
125 inactivate_flags |= XFS_PQUOTA_ACTIVE; 126 inactivate_flags |= XFS_PQUOTA_ACTIVE;
126 } 127 }
127 128
@@ -198,10 +199,9 @@ xfs_qm_scall_quotaoff(
198 } 199 }
199 200
200 /* 201 /*
201 * If quotas is completely disabled, close shop. 202 * If all quotas are completely turned off, close shop.
202 */ 203 */
203 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 204 if (mp->m_qflags == 0) {
204 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
205 mutex_unlock(&q->qi_quotaofflock); 205 mutex_unlock(&q->qi_quotaofflock);
206 xfs_qm_destroy_quotainfo(mp); 206 xfs_qm_destroy_quotainfo(mp);
207 return (0); 207 return (0);
@@ -214,10 +214,14 @@ xfs_qm_scall_quotaoff(
214 IRELE(q->qi_uquotaip); 214 IRELE(q->qi_uquotaip);
215 q->qi_uquotaip = NULL; 215 q->qi_uquotaip = NULL;
216 } 216 }
217 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { 217 if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
218 IRELE(q->qi_gquotaip); 218 IRELE(q->qi_gquotaip);
219 q->qi_gquotaip = NULL; 219 q->qi_gquotaip = NULL;
220 } 220 }
221 if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
222 IRELE(q->qi_pquotaip);
223 q->qi_pquotaip = NULL;
224 }
221 225
222out_unlock: 226out_unlock:
223 mutex_unlock(&q->qi_quotaofflock); 227 mutex_unlock(&q->qi_quotaofflock);
@@ -335,14 +339,14 @@ xfs_qm_scall_quotaon(
335 * quota acct on ondisk without m_qflags' knowing. 339 * quota acct on ondisk without m_qflags' knowing.
336 */ 340 */
337 if (((flags & XFS_UQUOTA_ACCT) == 0 && 341 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
338 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && 342 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
339 (flags & XFS_UQUOTA_ENFD)) 343 (flags & XFS_UQUOTA_ENFD)) ||
340 || 344 ((flags & XFS_GQUOTA_ACCT) == 0 &&
345 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
346 (flags & XFS_GQUOTA_ENFD)) ||
341 ((flags & XFS_PQUOTA_ACCT) == 0 && 347 ((flags & XFS_PQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && 348 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
343 (flags & XFS_GQUOTA_ACCT) == 0 && 349 (flags & XFS_PQUOTA_ENFD))) {
344 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (flags & XFS_OQUOTA_ENFD))) {
346 xfs_debug(mp, 350 xfs_debug(mp,
347 "%s: Can't enforce without acct, flags=%x sbflags=%x\n", 351 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
348 __func__, flags, mp->m_sb.sb_qflags); 352 __func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +411,11 @@ xfs_qm_scall_getqstat(
407 struct fs_quota_stat *out) 411 struct fs_quota_stat *out)
408{ 412{
409 struct xfs_quotainfo *q = mp->m_quotainfo; 413 struct xfs_quotainfo *q = mp->m_quotainfo;
410 struct xfs_inode *uip, *gip; 414 struct xfs_inode *uip = NULL;
411 bool tempuqip, tempgqip; 415 struct xfs_inode *gip = NULL;
416 bool tempuqip = false;
417 bool tempgqip = false;
412 418
413 uip = gip = NULL;
414 tempuqip = tempgqip = false;
415 memset(out, 0, sizeof(fs_quota_stat_t)); 419 memset(out, 0, sizeof(fs_quota_stat_t));
416 420
417 out->qs_version = FS_QSTAT_VERSION; 421 out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +780,12 @@ xfs_qm_scall_getquota(
776 * gets turned off. No need to confuse the user level code, 780 * gets turned off. No need to confuse the user level code,
777 * so return zeroes in that case. 781 * so return zeroes in that case.
778 */ 782 */
779 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || 783 if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
780 (!XFS_IS_OQUOTA_ENFORCED(mp) && 784 dqp->q_core.d_flags == XFS_DQ_USER) ||
781 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 785 (!XFS_IS_GQUOTA_ENFORCED(mp) &&
786 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
787 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
788 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
782 dst->d_btimer = 0; 789 dst->d_btimer = 0;
783 dst->d_itimer = 0; 790 dst->d_itimer = 0;
784 dst->d_rtbtimer = 0; 791 dst->d_rtbtimer = 0;
@@ -786,8 +793,8 @@ xfs_qm_scall_getquota(
786 793
787#ifdef DEBUG 794#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 795 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 796 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
790 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && 797 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
791 dst->d_id != 0) { 798 dst->d_id != 0) {
792 if ((dst->d_bcount > dst->d_blk_softlimit) && 799 if ((dst->d_bcount > dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 800 (dst->d_blk_softlimit > 0)) {
@@ -833,16 +840,16 @@ xfs_qm_export_flags(
833 uflags = 0; 840 uflags = 0;
834 if (flags & XFS_UQUOTA_ACCT) 841 if (flags & XFS_UQUOTA_ACCT)
835 uflags |= FS_QUOTA_UDQ_ACCT; 842 uflags |= FS_QUOTA_UDQ_ACCT;
836 if (flags & XFS_PQUOTA_ACCT)
837 uflags |= FS_QUOTA_PDQ_ACCT;
838 if (flags & XFS_GQUOTA_ACCT) 843 if (flags & XFS_GQUOTA_ACCT)
839 uflags |= FS_QUOTA_GDQ_ACCT; 844 uflags |= FS_QUOTA_GDQ_ACCT;
845 if (flags & XFS_PQUOTA_ACCT)
846 uflags |= FS_QUOTA_PDQ_ACCT;
840 if (flags & XFS_UQUOTA_ENFD) 847 if (flags & XFS_UQUOTA_ENFD)
841 uflags |= FS_QUOTA_UDQ_ENFD; 848 uflags |= FS_QUOTA_UDQ_ENFD;
842 if (flags & (XFS_OQUOTA_ENFD)) { 849 if (flags & XFS_GQUOTA_ENFD)
843 uflags |= (flags & XFS_GQUOTA_ACCT) ? 850 uflags |= FS_QUOTA_GDQ_ENFD;
844 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; 851 if (flags & XFS_PQUOTA_ENFD)
845 } 852 uflags |= FS_QUOTA_PDQ_ENFD;
846 return (uflags); 853 return (uflags);
847} 854}
848 855
@@ -856,9 +863,11 @@ xfs_dqrele_inode(
856{ 863{
857 /* skip quota inodes */ 864 /* skip quota inodes */
858 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 865 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
859 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 866 ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
867 ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
860 ASSERT(ip->i_udquot == NULL); 868 ASSERT(ip->i_udquot == NULL);
861 ASSERT(ip->i_gdquot == NULL); 869 ASSERT(ip->i_gdquot == NULL);
870 ASSERT(ip->i_pdquot == NULL);
862 return 0; 871 return 0;
863 } 872 }
864 873
@@ -867,10 +876,14 @@ xfs_dqrele_inode(
867 xfs_qm_dqrele(ip->i_udquot); 876 xfs_qm_dqrele(ip->i_udquot);
868 ip->i_udquot = NULL; 877 ip->i_udquot = NULL;
869 } 878 }
870 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 879 if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
871 xfs_qm_dqrele(ip->i_gdquot); 880 xfs_qm_dqrele(ip->i_gdquot);
872 ip->i_gdquot = NULL; 881 ip->i_gdquot = NULL;
873 } 882 }
883 if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
884 xfs_qm_dqrele(ip->i_pdquot);
885 ip->i_pdquot = NULL;
886 }
874 xfs_iunlock(ip, XFS_ILOCK_EXCL); 887 xfs_iunlock(ip, XFS_ILOCK_EXCL);
875 return 0; 888 return 0;
876} 889}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..b14f42c714b6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,11 +108,28 @@ typedef struct xfs_dqblk {
108 { XFS_DQ_FREEING, "FREEING" } 108 { XFS_DQ_FREEING, "FREEING" }
109 109
110/* 110/*
111 * In the worst case, when both user and group quotas are on, 111 * We have the possibility of all three quota types being active at once, and
112 * we can have a max of three dquots changing in a single transaction. 112 * hence free space modification requires modification of all three current
113 * dquots in a single transaction. For this case we need to have a reservation
114 * of at least 3 dquots.
115 *
116 * However, a chmod operation can change both UID and GID in a single
117 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
118 * modified. Hence for this case we need to reserve space for at least 4 dquots.
119 *
120 * And in the worst case, there's a rename operation that can be modifying up to
121 * 4 inodes with dquots attached to them. In reality, the only inodes that can
122 * have their dquots modified are the source and destination directory inodes
123 * due to directory name creation and removal. That can require space allocation
124 * and/or freeing on both directory inodes, and hence all three dquots on each
125 * inode can be modified. And if the directories are world writeable, all the
126 * dquots can be unique and so 6 dquots can be modified....
127 *
128 * And, of course, we also need to take into account the dquot log format item
129 * used to describe each dquot.
113 */ 130 */
114#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) 131#define XFS_DQUOT_LOGRES(mp) \
115 132 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
116 133
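As a worked example of the new worst case described above: a rename across two world-writable directories can dirty 3 quota types times 2 inodes, i.e. 6 unique dquots, and each one is logged as a format header plus the dquot core. The sizes below are assumptions (104 bytes for struct xfs_disk_dquot, 24 for struct xfs_dq_logformat); check the headers for the authoritative values.

#include <stdio.h>

#define SIZEOF_DQ_LOGFORMAT	24	/* assumed sizeof(struct xfs_dq_logformat) */
#define SIZEOF_DISK_DQUOT	104	/* assumed sizeof(struct xfs_disk_dquot) */

int main(void)
{
	int ndquots = 3 * 2;	/* 3 quota types x 2 directory inodes */
	int logres = ndquots * (SIZEOF_DQ_LOGFORMAT + SIZEOF_DISK_DQUOT);

	printf("XFS_DQUOT_LOGRES worst case: %d x (%d + %d) = %d bytes\n",
	       ndquots, SIZEOF_DQ_LOGFORMAT, SIZEOF_DISK_DQUOT, logres);
	return 0;
}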
117/* 134/*
118 * These are the structures used to lay out dquots and quotaoff 135 * These are the structures used to lay out dquots and quotaoff
@@ -161,30 +178,42 @@ typedef struct xfs_qoff_logformat {
161#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ 178#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
162 179
163/* 180/*
181 * Conversion to and from the combined OQUOTA flag (if necessary)
182 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
183 */
184#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
185#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
186#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
187#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
188
189/*
164 * Quota Accounting/Enforcement flags 190 * Quota Accounting/Enforcement flags
165 */ 191 */
166#define XFS_ALL_QUOTA_ACCT \ 192#define XFS_ALL_QUOTA_ACCT \
167 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) 193 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
168#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) 194#define XFS_ALL_QUOTA_ENFD \
169#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) 195 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
196#define XFS_ALL_QUOTA_CHKD \
197 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
170 198
171#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) 199#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
172#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) 200#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
173#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) 201#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
174#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) 202#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
175#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) 203#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
176#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) 204#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
205#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
177 206
178/* 207/*
179 * Incore only flags for quotaoff - these bits get cleared when quota(s) 208 * Incore only flags for quotaoff - these bits get cleared when quota(s)
180 * are in the process of getting turned off. These flags are in m_qflags but 209 * are in the process of getting turned off. These flags are in m_qflags but
181 * never in sb_qflags. 210 * never in sb_qflags.
182 */ 211 */
183#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 212#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
184#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 213#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
185#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 214#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
186#define XFS_ALL_QUOTA_ACTIVE \ 215#define XFS_ALL_QUOTA_ACTIVE \
187 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) 216 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
188 217
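The comment above notes that the combined OQUOTA flag is translated only at the superblock read/write boundary, in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk(). A hedged sketch of what the from-disk direction of that translation could look like; the OQUOTA_* values are assumptions (historically 0x0010/0x0020), while the GQUOTA_*/PQUOTA_* values match the definitions in this hunk:

#include <stdio.h>

#define XFS_PQUOTA_ACCT	0x0008
#define XFS_OQUOTA_ENFD	0x0010	/* old combined on-disk flag (assumed) */
#define XFS_OQUOTA_CHKD	0x0020	/* old combined on-disk flag (assumed) */
#define XFS_GQUOTA_ENFD	0x0080
#define XFS_GQUOTA_CHKD	0x0100
#define XFS_PQUOTA_ENFD	0x0200
#define XFS_PQUOTA_CHKD	0x0400

/* Sketch: the combined OQUOTA bit becomes a group or project bit
 * depending on which accounting mode the superblock carries. */
static unsigned int qflags_from_disk(unsigned int sbq)
{
	if (sbq & XFS_OQUOTA_ENFD) {
		sbq &= ~XFS_OQUOTA_ENFD;
		sbq |= (sbq & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_ENFD
					       : XFS_GQUOTA_ENFD;
	}
	if (sbq & XFS_OQUOTA_CHKD) {
		sbq &= ~XFS_OQUOTA_CHKD;
		sbq |= (sbq & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_CHKD
					       : XFS_GQUOTA_CHKD;
	}
	return sbq;
}

int main(void)
{
	unsigned int disk = XFS_PQUOTA_ACCT | XFS_OQUOTA_ENFD;

	/* -> PQUOTA_ACCT | PQUOTA_ENFD (0x0208) */
	printf("incore qflags: 0x%x\n", qflags_from_disk(disk));
	return 0;
}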
189/* 218/*
190 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 219 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,33 +288,24 @@ typedef struct xfs_qoff_logformat {
259 * we didn't have the inode locked, the appropriate dquot(s) will be 288 * we didn't have the inode locked, the appropriate dquot(s) will be
260 * attached atomically. 289 * attached atomically.
261 */ 290 */
262#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ 291#define XFS_NOT_DQATTACHED(mp, ip) \
263 (ip)->i_udquot == NULL) || \ 292 ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
264 (XFS_IS_OQUOTA_ON(mp) && \ 293 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
265 (ip)->i_gdquot == NULL)) 294 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
266 295
267#define XFS_QM_NEED_QUOTACHECK(mp) \ 296#define XFS_QM_NEED_QUOTACHECK(mp) \
268 ((XFS_IS_UQUOTA_ON(mp) && \ 297 ((XFS_IS_UQUOTA_ON(mp) && \
269 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ 298 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
270 (XFS_IS_GQUOTA_ON(mp) && \ 299 (XFS_IS_GQUOTA_ON(mp) && \
271 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 300 (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
272 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
273 (XFS_IS_PQUOTA_ON(mp) && \ 301 (XFS_IS_PQUOTA_ON(mp) && \
274 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 302 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
275 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
276
277#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
278 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
279 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
280
281#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
282 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
283 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
284 303
285#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 304#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
286 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 305 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
287 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 306 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
288 XFS_GQUOTA_ACCT) 307 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
308 XFS_PQUOTA_CHKD)
289 309
290 310
291/* 311/*
@@ -318,17 +338,18 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
318 struct xfs_inode *, long, long, uint); 338 struct xfs_inode *, long, long, uint);
319extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, 339extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
320 struct xfs_mount *, struct xfs_dquot *, 340 struct xfs_mount *, struct xfs_dquot *,
321 struct xfs_dquot *, long, long, uint); 341 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
322 342
323extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, 343extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
324 struct xfs_dquot **, struct xfs_dquot **); 344 struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
325extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, 345extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
326 struct xfs_dquot *, struct xfs_dquot *); 346 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
327extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); 347extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
328extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, 348extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
329 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); 349 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
330extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, 350extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
331 struct xfs_dquot *, struct xfs_dquot *, uint); 351 struct xfs_dquot *, struct xfs_dquot *,
352 struct xfs_dquot *, uint);
332extern int xfs_qm_dqattach(struct xfs_inode *, uint); 353extern int xfs_qm_dqattach(struct xfs_inode *, uint);
333extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); 354extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
334extern void xfs_qm_dqdetach(struct xfs_inode *); 355extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -342,10 +363,12 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
342#else 363#else
343static inline int 364static inline int
344xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, 365xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
345 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) 366 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
367 struct xfs_dquot **pdqp)
346{ 368{
347 *udqp = NULL; 369 *udqp = NULL;
348 *gdqp = NULL; 370 *gdqp = NULL;
371 *pdqp = NULL;
349 return 0; 372 return 0;
350} 373}
351#define xfs_trans_dup_dqinfo(tp, tp2) 374#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -360,14 +383,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
360} 383}
361static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, 384static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
362 struct xfs_mount *mp, struct xfs_dquot *udqp, 385 struct xfs_mount *mp, struct xfs_dquot *udqp,
363 struct xfs_dquot *gdqp, long nblks, long nions, uint flags) 386 struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
387 long nblks, long nions, uint flags)
364{ 388{
365 return 0; 389 return 0;
366} 390}
367#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 391#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
368#define xfs_qm_vop_rename_dqattach(it) (0) 392#define xfs_qm_vop_rename_dqattach(it) (0)
369#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 393#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
370#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) 394#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
371#define xfs_qm_dqattach(ip, fl) (0) 395#define xfs_qm_dqattach(ip, fl) (0)
372#define xfs_qm_dqattach_locked(ip, fl) (0) 396#define xfs_qm_dqattach_locked(ip, fl) (0)
373#define xfs_qm_dqdetach(ip) 397#define xfs_qm_dqdetach(ip)
@@ -381,8 +405,8 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
381 405
382#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 406#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
383 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) 407 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
384#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ 408#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
385 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 409 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
386 f | XFS_QMOPT_RES_REGBLKS) 410 f | XFS_QMOPT_RES_REGBLKS)
387 411
388extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, 412extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
75 flags |= XFS_GQUOTA_ACCT; 75 flags |= XFS_GQUOTA_ACCT;
76 if (uflags & FS_QUOTA_UDQ_ENFD) 76 if (uflags & FS_QUOTA_UDQ_ENFD)
77 flags |= XFS_UQUOTA_ENFD; 77 flags |= XFS_UQUOTA_ENFD;
78 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) 78 if (uflags & FS_QUOTA_GDQ_ENFD)
79 flags |= XFS_OQUOTA_ENFD; 79 flags |= XFS_GQUOTA_ENFD;
80 if (uflags & FS_QUOTA_PDQ_ENFD)
81 flags |= XFS_PQUOTA_ENFD;
80 82
81 switch (op) { 83 switch (op) {
82 case Q_XQUOTAON: 84 case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
623{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
625}
626
621/* 627/*
622 * end of superblock version macros 628 * end of superblock version macros
623 */ 629 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_icache.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54#include "xfs_icreate_item.h"
54 55
55#include <linux/namei.h> 56#include <linux/namei.h>
56#include <linux/init.h> 57#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
359 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 360 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
360 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 361 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
361 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 362 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
362 XFS_OQUOTA_ENFD); 363 XFS_PQUOTA_ENFD);
363 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 364 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
364 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 365 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
365 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 366 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
366 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 367 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
367 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 368 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
368 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
369 XFS_OQUOTA_ENFD); 370 XFS_GQUOTA_ENFD);
370 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 371 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
371 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 372 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
372 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 373 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
373 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 374 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
374 xfs_warn(mp, 375 xfs_warn(mp,
375 "delaylog is the default now, option is deprecated."); 376 "delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
439 } 440 }
440 441
441done: 442done:
442 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 443 if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
443 /* 444 /*
444 * At this point the superblock has not been read 445 * At this point the superblock has not been read
445 * in, therefore we do not know the block size. 446 * in, therefore we do not know the block size.
446 * Before the mount call ends we will convert 447 * Before the mount call ends we will convert
447 * these to FSBs. 448 * these to FSBs.
448 */ 449 */
449 if (dsunit) { 450 mp->m_dalign = dsunit;
450 mp->m_dalign = dsunit; 451 mp->m_swidth = dswidth;
451 mp->m_flags |= XFS_MOUNT_RETERR;
452 }
453
454 if (dswidth)
455 mp->m_swidth = dswidth;
456 } 452 }
457 453
458 if (mp->m_logbufs != -1 && 454 if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
563 /* Either project or group quotas can be active, not both */ 559 /* Either project or group quotas can be active, not both */
564 560
565 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 561 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
566 if (mp->m_qflags & XFS_OQUOTA_ENFD) 562 if (mp->m_qflags & XFS_PQUOTA_ENFD)
567 seq_puts(m, "," MNTOPT_PRJQUOTA); 563 seq_puts(m, "," MNTOPT_PRJQUOTA);
568 else 564 else
569 seq_puts(m, "," MNTOPT_PQUOTANOENF); 565 seq_puts(m, "," MNTOPT_PQUOTANOENF);
570 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
571 if (mp->m_qflags & XFS_OQUOTA_ENFD) 567 if (mp->m_qflags & XFS_GQUOTA_ENFD)
572 seq_puts(m, "," MNTOPT_GRPQUOTA); 568 seq_puts(m, "," MNTOPT_GRPQUOTA);
573 else 569 else
574 seq_puts(m, "," MNTOPT_GQUOTANOENF); 570 seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
1136 spin_unlock(&mp->m_sb_lock); 1132 spin_unlock(&mp->m_sb_lock);
1137 1133
1138 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1134 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1139 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == 1135 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
1140 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 1136 (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
1141 xfs_qm_statvfs(ip, statp); 1137 xfs_qm_statvfs(ip, statp);
1142 return 0; 1138 return 0;
1143} 1139}
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
1481 sb->s_time_gran = 1; 1477 sb->s_time_gran = 1;
1482 set_posix_acl_flag(sb); 1478 set_posix_acl_flag(sb);
1483 1479
1480 /* version 5 superblocks support inode version counters. */
1481 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
1482 sb->s_flags |= MS_I_VERSION;
1483
1484 error = xfs_mountfs(mp); 1484 error = xfs_mountfs(mp);
1485 if (error) 1485 if (error)
1486 goto out_filestream_unmount; 1486 goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
1655 KM_ZONE_SPREAD, NULL); 1655 KM_ZONE_SPREAD, NULL);
1656 if (!xfs_ili_zone) 1656 if (!xfs_ili_zone)
1657 goto out_destroy_inode_zone; 1657 goto out_destroy_inode_zone;
1658 xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
1659 "xfs_icr");
1660 if (!xfs_icreate_zone)
1661 goto out_destroy_ili_zone;
1658 1662
1659 return 0; 1663 return 0;
1660 1664
1665 out_destroy_ili_zone:
1666 kmem_zone_destroy(xfs_ili_zone);
1661 out_destroy_inode_zone: 1667 out_destroy_inode_zone:
1662 kmem_zone_destroy(xfs_inode_zone); 1668 kmem_zone_destroy(xfs_inode_zone);
1663 out_destroy_efi_zone: 1669 out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
1696 * destroy caches. 1702 * destroy caches.
1697 */ 1703 */
1698 rcu_barrier(); 1704 rcu_barrier();
1705 kmem_zone_destroy(xfs_icreate_zone);
1699 kmem_zone_destroy(xfs_ili_zone); 1706 kmem_zone_destroy(xfs_ili_zone);
1700 kmem_zone_destroy(xfs_inode_zone); 1707 kmem_zone_destroy(xfs_inode_zone);
1701 kmem_zone_destroy(xfs_efi_zone); 1708 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..f4895b662fcb 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,9 @@ xfs_symlink(
358 int n; 358 int n;
359 xfs_buf_t *bp; 359 xfs_buf_t *bp;
360 prid_t prid; 360 prid_t prid;
361 struct xfs_dquot *udqp, *gdqp; 361 struct xfs_dquot *udqp = NULL;
362 struct xfs_dquot *gdqp = NULL;
363 struct xfs_dquot *pdqp = NULL;
362 uint resblks; 364 uint resblks;
363 365
364 *ipp = NULL; 366 *ipp = NULL;
@@ -385,7 +387,7 @@ xfs_symlink(
385 * Make sure that we have allocated dquot(s) on disk. 387 * Make sure that we have allocated dquot(s) on disk.
386 */ 388 */
387 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 389 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
388 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 390 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
389 if (error) 391 if (error)
390 goto std_return; 392 goto std_return;
391 393
@@ -426,7 +428,8 @@ xfs_symlink(
426 /* 428 /*
427 * Reserve disk quota : blocks and inode. 429 * Reserve disk quota : blocks and inode.
428 */ 430 */
429 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); 431 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
432 pdqp, resblks, 1, 0);
430 if (error) 433 if (error)
431 goto error_return; 434 goto error_return;
432 435
@@ -464,7 +467,7 @@ xfs_symlink(
464 /* 467 /*
465 * Also attach the dquot(s) to it, if applicable. 468 * Also attach the dquot(s) to it, if applicable.
466 */ 469 */
467 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 470 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
468 471
469 if (resblks) 472 if (resblks)
470 resblks -= XFS_IALLOC_SPACE_RES(mp); 473 resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -562,6 +565,7 @@ xfs_symlink(
562 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 565 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
563 xfs_qm_dqrele(udqp); 566 xfs_qm_dqrele(udqp);
564 xfs_qm_dqrele(gdqp); 567 xfs_qm_dqrele(gdqp);
568 xfs_qm_dqrele(pdqp);
565 569
566 *ipp = ip; 570 *ipp = ip;
567 return 0; 571 return 0;
@@ -575,6 +579,7 @@ xfs_symlink(
575 xfs_trans_cancel(tp, cancel_flags); 579 xfs_trans_cancel(tp, cancel_flags);
576 xfs_qm_dqrele(udqp); 580 xfs_qm_dqrele(udqp);
577 xfs_qm_dqrele(gdqp); 581 xfs_qm_dqrele(gdqp);
582 xfs_qm_dqrele(pdqp);
578 583
579 if (unlock_dp_on_error) 584 if (unlock_dp_on_error)
580 xfs_iunlock(dp, XFS_ILOCK_EXCL); 585 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -585,7 +590,7 @@ xfs_symlink(
585/* 590/*
586 * Free a symlink that has blocks associated with it. 591 * Free a symlink that has blocks associated with it.
587 */ 592 */
588int 593STATIC int
589xfs_inactive_symlink_rmt( 594xfs_inactive_symlink_rmt(
590 xfs_inode_t *ip, 595 xfs_inode_t *ip,
591 xfs_trans_t **tpp) 596 xfs_trans_t **tpp)
@@ -606,7 +611,7 @@ xfs_inactive_symlink_rmt(
606 611
607 tp = *tpp; 612 tp = *tpp;
608 mp = ip->i_mount; 613 mp = ip->i_mount;
609 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); 614 ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
610 /* 615 /*
611 * We're freeing a symlink that has some 616 * We're freeing a symlink that has some
612 * blocks allocated to it. Free the 617 * blocks allocated to it. Free the
@@ -720,3 +725,47 @@ xfs_inactive_symlink_rmt(
720 error0: 725 error0:
721 return error; 726 return error;
722} 727}
728
729/*
730 * xfs_inactive_symlink - free a symlink
731 */
732int
733xfs_inactive_symlink(
734 struct xfs_inode *ip,
735 struct xfs_trans **tp)
736{
737 struct xfs_mount *mp = ip->i_mount;
738 int pathlen;
739
740 trace_xfs_inactive_symlink(ip);
741
742 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
743
744 if (XFS_FORCED_SHUTDOWN(mp))
745 return XFS_ERROR(EIO);
746
747 /*
748 * Zero length symlinks _can_ exist.
749 */
750 pathlen = (int)ip->i_d.di_size;
751 if (!pathlen)
752 return 0;
753
754 if (pathlen < 0 || pathlen > MAXPATHLEN) {
755 xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
756 __func__, (unsigned long long)ip->i_ino, pathlen);
757 ASSERT(0);
758 return XFS_ERROR(EFSCORRUPTED);
759 }
760
761 if (ip->i_df.if_flags & XFS_IFINLINE) {
762 if (ip->i_df.if_bytes > 0)
763 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
764 XFS_DATA_FORK);
765 ASSERT(ip->i_df.if_bytes == 0);
766 return 0;
767 }
768
769 /* remove the remote symlink */
770 return xfs_inactive_symlink_rmt(ip, tp);
771}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 61 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 62int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); 63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 64
65#endif /* __KERNEL__ */ 65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 66#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
25#ifdef CONFIG_PROC_FS 25#ifdef CONFIG_PROC_FS
26STATIC int 26STATIC int
27xfs_stats_clear_proc_handler( 27xfs_stats_clear_proc_handler(
28 ctl_table *ctl, 28 struct ctl_table *ctl,
29 int write, 29 int write,
30 void __user *buffer, 30 void __user *buffer,
31 size_t *lenp, 31 size_t *lenp,
32 loff_t *ppos) 32 loff_t *ppos)
33{ 33{
34 int c, ret, *valp = ctl->data; 34 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 35 __uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
55 55
56STATIC int 56STATIC int
57xfs_panic_mask_proc_handler( 57xfs_panic_mask_proc_handler(
58 ctl_table *ctl, 58 struct ctl_table *ctl,
59 int write, 59 int write,
60 void __user *buffer, 60 void __user *buffer,
61 size_t *lenp, 61 size_t *lenp,
62 loff_t *ppos) 62 loff_t *ppos)
63{ 63{
64 int ret, *valp = ctl->data; 64 int ret, *valp = ctl->data;
65 65
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
74} 74}
75#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
76 76
77static ctl_table xfs_table[] = { 77static struct ctl_table xfs_table[] = {
78 { 78 {
79 .procname = "irix_sgid_inherit", 79 .procname = "irix_sgid_inherit",
80 .data = &xfs_params.sgid_inherit.val, 80 .data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
227 {} 227 {}
228}; 228};
229 229
230static ctl_table xfs_dir_table[] = { 230static struct ctl_table xfs_dir_table[] = {
231 { 231 {
232 .procname = "xfs", 232 .procname = "xfs",
233 .mode = 0555, 233 .mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
236 {} 236 {}
237}; 237};
238 238
239static ctl_table xfs_root_table[] = { 239static struct ctl_table xfs_root_table[] = {
240 { 240 {
241 .procname = "fs", 241 .procname = "fs",
242 .mode = 0555, 242 .mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
486 TP_PROTO(struct xfs_buf_log_item *bip), \ 486 TP_PROTO(struct xfs_buf_log_item *bip), \
487 TP_ARGS(bip)) 487 TP_ARGS(bip))
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); 493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 495DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 496DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 497DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
508DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); 511DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
509DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); 512DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
510DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); 513DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
514DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
511 515
512DECLARE_EVENT_CLASS(xfs_lock_class, 516DECLARE_EVENT_CLASS(xfs_lock_class,
513 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, 517 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
571DEFINE_INODE_EVENT(xfs_getattr); 575DEFINE_INODE_EVENT(xfs_getattr);
572DEFINE_INODE_EVENT(xfs_setattr); 576DEFINE_INODE_EVENT(xfs_setattr);
573DEFINE_INODE_EVENT(xfs_readlink); 577DEFINE_INODE_EVENT(xfs_readlink);
578DEFINE_INODE_EVENT(xfs_inactive_symlink);
574DEFINE_INODE_EVENT(xfs_alloc_file_space); 579DEFINE_INODE_EVENT(xfs_alloc_file_space);
575DEFINE_INODE_EVENT(xfs_free_file_space); 580DEFINE_INODE_EVENT(xfs_free_file_space);
576DEFINE_INODE_EVENT(xfs_readdir); 581DEFINE_INODE_EVENT(xfs_readdir);
@@ -974,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
974DEFINE_RW_EVENT(xfs_file_splice_write); 979DEFINE_RW_EVENT(xfs_file_splice_write);
975 980
976DECLARE_EVENT_CLASS(xfs_page_class, 981DECLARE_EVENT_CLASS(xfs_page_class,
977 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), 982 TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
978 TP_ARGS(inode, page, off), 983 unsigned int len),
984 TP_ARGS(inode, page, off, len),
979 TP_STRUCT__entry( 985 TP_STRUCT__entry(
980 __field(dev_t, dev) 986 __field(dev_t, dev)
981 __field(xfs_ino_t, ino) 987 __field(xfs_ino_t, ino)
982 __field(pgoff_t, pgoff) 988 __field(pgoff_t, pgoff)
983 __field(loff_t, size) 989 __field(loff_t, size)
984 __field(unsigned long, offset) 990 __field(unsigned long, offset)
991 __field(unsigned int, length)
985 __field(int, delalloc) 992 __field(int, delalloc)
986 __field(int, unwritten) 993 __field(int, unwritten)
987 ), 994 ),
@@ -995,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
995 __entry->pgoff = page_offset(page); 1002 __entry->pgoff = page_offset(page);
996 __entry->size = i_size_read(inode); 1003 __entry->size = i_size_read(inode);
997 __entry->offset = off; 1004 __entry->offset = off;
1005 __entry->length = len;
998 __entry->delalloc = delalloc; 1006 __entry->delalloc = delalloc;
999 __entry->unwritten = unwritten; 1007 __entry->unwritten = unwritten;
1000 ), 1008 ),
1001 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " 1009 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
1002 "delalloc %d unwritten %d", 1010 "length %x delalloc %d unwritten %d",
1003 MAJOR(__entry->dev), MINOR(__entry->dev), 1011 MAJOR(__entry->dev), MINOR(__entry->dev),
1004 __entry->ino, 1012 __entry->ino,
1005 __entry->pgoff, 1013 __entry->pgoff,
1006 __entry->size, 1014 __entry->size,
1007 __entry->offset, 1015 __entry->offset,
1016 __entry->length,
1008 __entry->delalloc, 1017 __entry->delalloc,
1009 __entry->unwritten) 1018 __entry->unwritten)
1010) 1019)
1011 1020
1012#define DEFINE_PAGE_EVENT(name) \ 1021#define DEFINE_PAGE_EVENT(name) \
1013DEFINE_EVENT(xfs_page_class, name, \ 1022DEFINE_EVENT(xfs_page_class, name, \
1014 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 1023 TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
1015 TP_ARGS(inode, page, off)) 1024 unsigned int len), \
1025 TP_ARGS(inode, page, off, len))
1016DEFINE_PAGE_EVENT(xfs_writepage); 1026DEFINE_PAGE_EVENT(xfs_writepage);
1017DEFINE_PAGE_EVENT(xfs_releasepage); 1027DEFINE_PAGE_EVENT(xfs_releasepage);
1018DEFINE_PAGE_EVENT(xfs_invalidatepage); 1028DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
234} 234}
235 235
236/* 236/*
237 * For symlink we can modify: 237 * For create, break it into the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
238 * the parent directory inode: inode size 244 * the parent directory inode: inode size
239 * the new inode: inode size 245 * the new inode: inode size
240 * the inode btree entry: 1 block 246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
241 * the directory btree: (max depth + v2) * dir block size 248 * the directory btree: (max depth + v2) * dir block size
242 * the directory inode's bmap btree: (max depth + v2) * block size 249 * the directory inode's bmap btree: (max depth + v2) * block size
243 * the blocks for the symlink: 1 kB 250 */
244 * Or in the first xact we allocate some inodes giving: 251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
245 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
246 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
247 * the inode btree: max depth * blocksize 266 * the inode btree: max depth * blocksize
248 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size 267 * the allocation btrees: 2 trees * (max depth - 1) * block size
249 */ 268 */
250STATIC uint 269STATIC uint
251xfs_calc_symlink_reservation( 270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
252 struct xfs_mount *mp) 283 struct xfs_mount *mp)
253{ 284{
254 return XFS_DQUOT_LOGRES(mp) + 285 return XFS_DQUOT_LOGRES(mp) +
255 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 286 MAX(xfs_calc_create_resv_alloc(mp),
256 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 287 xfs_calc_create_resv_modify(mp));
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1)) +
259 xfs_calc_buf_res(1, 1024)),
260 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
261 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
262 XFS_FSB_TO_B(mp, 1)) +
263 xfs_calc_buf_res(mp->m_in_maxlevels,
264 XFS_FSB_TO_B(mp, 1)) +
265 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
266 XFS_FSB_TO_B(mp, 1))));
267} 288}
268 289
269/* 290/*
270 * For create we can modify: 291 * For icreate we can allocate some inodes giving:
271 * the parent directory inode: inode size
272 * the new inode: inode size
273 * the inode btree entry: block size
274 * the superblock for the nlink flag: sector size
275 * the directory btree: (max depth + v2) * dir block size
276 * the directory inode's bmap btree: (max depth + v2) * block size
277 * Or in the first xact we allocate some inodes giving:
278 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
279 * the superblock for the nlink flag: sector size 293 * the superblock for the nlink flag: sector size
280 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
281 * the inode btree: max depth * blocksize 294 * the inode btree: max depth * blocksize
282 * the allocation btrees: 2 trees * (max depth - 1) * block size 295 * the allocation btrees: 2 trees * (max depth - 1) * block size
283 */ 296 */
284STATIC uint 297STATIC uint
285xfs_calc_create_reservation( 298xfs_calc_icreate_resv_alloc(
286 struct xfs_mount *mp) 299 struct xfs_mount *mp)
287{ 300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
288 return XFS_DQUOT_LOGRES(mp) + 311 return XFS_DQUOT_LOGRES(mp) +
289 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 312 MAX(xfs_calc_icreate_resv_alloc(mp),
290 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 313 xfs_calc_create_resv_modify(mp));
291 (uint)XFS_FSB_TO_B(mp, 1) + 314}
292 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 315
293 XFS_FSB_TO_B(mp, 1))), 316STATIC uint
294 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 317xfs_calc_create_reservation(
295 mp->m_sb.sb_sectsize + 318 struct xfs_mount *mp)
296 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), 319{
297 XFS_FSB_TO_B(mp, 1)) + 320 if (xfs_sb_version_hascrc(&mp->m_sb))
298 xfs_calc_buf_res(mp->m_in_maxlevels, 321 return xfs_calc_icreate_reservation(mp);
299 XFS_FSB_TO_B(mp, 1)) + 322 return __xfs_calc_create_reservation(mp);
300 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 323
301 XFS_FSB_TO_B(mp, 1))));
302} 324}
303 325
304/* 326/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
311 return xfs_calc_create_reservation(mp); 333 return xfs_calc_create_reservation(mp);
312} 334}
313 335
336
337/*
338 * Making a new symplink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
349
314/* 350/*
315 * In freeing an inode we can modify: 351 * In freeing an inode we can modify:
316 * the inode being freed: inode size 352 * the inode being freed: inode size
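
The xfs_trans.c refactor above replaces one large MAX() expression with named helpers for the two worst cases a create transaction can hit: dirtying existing metadata (the modify case) versus allocating a new inode chunk (the alloc case), with the reservation sized for whichever is larger. Below is a standalone model of that split; the geometry numbers are invented for illustration and buf_res() is only a crude stand-in for xfs_calc_buf_res().

	#include <stdio.h>

	#define MAX(a, b)	((a) >= (b) ? (a) : (b))

	/* stand-in for xfs_calc_buf_res(): nbufs * (payload + log overhead) */
	static unsigned int buf_res(unsigned int nbufs, unsigned int size)
	{
		const unsigned int log_header = 128;	/* assumed overhead */
		return nbufs * (size + log_header);
	}

	int main(void)
	{
		const unsigned int inodesize = 512, sectsize = 512, blocksize = 4096;
		const unsigned int dirop_bufs = 10, ialloc_blocks = 16;
		const unsigned int in_maxlevels = 3, allocfree_bufs = 18;

		/* modify case: two inodes, superblock, directory blocks */
		unsigned int modify = buf_res(2, inodesize) +
				      buf_res(1, sectsize) +
				      blocksize +
				      buf_res(dirop_bufs, blocksize);

		/* alloc case: agi/agf, superblock, inode chunk, btrees */
		unsigned int alloc = buf_res(2, sectsize) +
				     sectsize +
				     buf_res(ialloc_blocks, blocksize) +
				     buf_res(in_maxlevels, blocksize) +
				     buf_res(allocfree_bufs, blocksize);

		printf("modify=%u alloc=%u reservation=%u\n",
		       modify, alloc, MAX(modify, alloc));
		return 0;
	}

Splitting the cases out is what lets the CRC path reuse xfs_calc_create_resv_modify() while swapping in the smaller icreate allocation case.
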
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
+#define	XFS_LI_ICREATE		0x123f
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
 #define XFS_TRANS_CHECKPOINT		42
-#define XFS_TRANS_TYPE_MAX		42
+#define XFS_TRANS_ICREATE		43
+#define XFS_TRANS_TYPE_MAX		43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
 /*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
  */
-#define	XFS_ALLOCFREE_LOG_RES(mp,nx) \
-	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
 #define	XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
 	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
  * size, so account for that in the DAENTER macros.
  */
-#define	XFS_DIROP_LOG_RES(mp)	\
-	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
-	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
 #define	XFS_DIROP_LOG_COUNT(mp)	\
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
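
With the byte-based *_LOG_RES macros deleted, reservations are expressed purely as buffer counts that callers feed through xfs_calc_buf_res(). A tiny model of the surviving XFS_ALLOCFREE_LOG_COUNT arithmetic, with an assumed allocation-group btree depth:

	#include <stdio.h>

	/* 2 trees * (2 blocks/level * max depth - 1), per extent */
	static unsigned int allocfree_log_count(unsigned int ag_maxlevels,
						unsigned int nextents)
	{
		return nextents * (2 * (2 * ag_maxlevels - 1));
	}

	int main(void)
	{
		printf("1 extent, depth 5: %u buffers\n", allocfree_log_count(5, 1));
		printf("2 extents, depth 5: %u buffers\n", allocfree_log_count(5, 2));
		return 0;
	}
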
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
 	return XFS_ERROR(EIO);
 }
 
-
 /*
  * Release the buffer bp which was previously acquired with one of the
  * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-	bip->bli_flags |= XFS_BLI_LOGGED;
-	xfs_buf_item_log(bip, first, last);
+
+	/*
+	 * If we have an ordered buffer we are not logging any dirty range but
+	 * we still need to mark the buffer dirty and record that it was logged.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+	if (!(bip->bli_flags & XFS_BLI_ORDERED))
+		xfs_buf_item_log(bip, first, last);
 }
 
 
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
 }
 
 /*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it were. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_ORDERED;
+	trace_xfs_buf_item_ordered(bip);
+}
+
+/*
  * Set the type of the buffer for log recovery so that it can correctly identify
  * and hence attach the correct buffer ops to the buffer after replay.
  */
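
A user-space model of the ordered-buffer behaviour added above -- not kernel code; the flag names only mirror the XFS_BLI_* flags. An ordered buffer joins the transaction and is marked dirty and logged, but no byte range is captured, so nothing from its contents ever reaches the log:

	#include <stdio.h>

	#define BLI_DIRTY	0x1
	#define BLI_LOGGED	0x2
	#define BLI_ORDERED	0x4

	struct buf_item {
		unsigned int	flags;
		unsigned int	first, last;	/* logged byte range */
	};

	static void log_buf(struct buf_item *bip, unsigned int first,
			    unsigned int last)
	{
		bip->flags |= BLI_DIRTY | BLI_LOGGED;
		if (!(bip->flags & BLI_ORDERED)) {	/* mirrors the patched test */
			bip->first = first;
			bip->last = last;
		}
	}

	int main(void)
	{
		struct buf_item normal = { 0 }, ordered = { .flags = BLI_ORDERED };

		log_buf(&normal, 0, 511);
		log_buf(&ordered, 0, 511);
		printf("normal:  logged range [%u,%u]\n", normal.first, normal.last);
		printf("ordered: logged range [%u,%u] (contents never logged)\n",
		       ordered.first, ordered.last);
		return 0;
	}
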
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..61407a847b86 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
 		return;
 
 	xfs_trans_alloc_dqinfo(ntp);
-	oqa = otp->t_dqinfo->dqa_usrdquots;
-	nqa = ntp->t_dqinfo->dqa_usrdquots;
 
 	/*
 	 * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
 	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
 		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
-	for (j = 0; j < 2; j++) {
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		oqa = otp->t_dqinfo->dqs[j];
+		nqa = ntp->t_dqinfo->dqs[j];
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			if (oqa[i].qt_dquot == NULL)
 				break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
 			oq->qt_ino_res = oq->qt_ino_res_used;
 
 		}
-		oqa = otp->t_dqinfo->dqa_grpdquots;
-		nqa = ntp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) ||
 	    !XFS_IS_QUOTA_ON(mp) ||
-	    ip->i_ino == mp->m_sb.sb_uquotino ||
-	    ip->i_ino == mp->m_sb.sb_gquotino)
+	    xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return;
 
 	if (tp->t_dqinfo == NULL)
@@ -166,20 +163,28 @@ xfs_trans_mod_dquot_byino(
 
 	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
 		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
-	if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
 		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+	if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+		(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
 }
 
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
 xfs_trans_get_dqtrx(
-	xfs_trans_t	*tp,
-	xfs_dquot_t	*dqp)
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp)
 {
-	int		i;
-	xfs_dqtrx_t	*qa;
+	int			i;
+	struct xfs_dqtrx	*qa;
 
-	qa = XFS_QM_ISUDQ(dqp) ?
-		tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+	if (XFS_QM_ISUDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+	else if (XFS_QM_ISGDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+	else if (XFS_QM_ISPDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+	else
+		return NULL;
 
 	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 		if (qa[i].qt_dquot == NULL ||
@@ -292,11 +297,10 @@ xfs_trans_mod_dquot(
 
 
 /*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified.  We know that the
+ * highest number of dquots of one type - usr, grp OR prj - involved in a
+ * transaction is 2, so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
@@ -339,12 +343,10 @@ xfs_trans_apply_dquot_deltas(
 		return;
 
 	ASSERT(tp->t_dqinfo);
-	qa = tp->t_dqinfo->dqa_usrdquots;
-	for (j = 0; j < 2; j++) {
-		if (qa[0].qt_dquot == NULL) {
-			qa = tp->t_dqinfo->dqa_grpdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
+		if (qa[0].qt_dquot == NULL)
 			continue;
-		}
 
 		/*
 		 * Lock all of the dquots and join them to the transaction.
@@ -495,10 +497,6 @@ xfs_trans_apply_dquot_deltas(
 			ASSERT(dqp->q_res_rtbcount >=
 			       be64_to_cpu(dqp->q_core.d_rtbcount));
 		}
-		/*
-		 * Do the group quotas next
-		 */
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -521,9 +519,9 @@ xfs_trans_unreserve_and_mod_dquots(
 	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
 		return;
 
-	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
 
-	for (j = 0; j < 2; j++) {
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			qtrx = &qa[i];
 			/*
@@ -565,7 +563,6 @@ xfs_trans_unreserve_and_mod_dquots(
 				xfs_dqunlock(dqp);
 
 		}
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -640,8 +637,8 @@ xfs_trans_dqresv(
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
 	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
-	      (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
 		if (nblks > 0) {
 			/*
 			 * dquot is locked already. See if we'd go over the
@@ -736,8 +733,8 @@ error_return:
 
 /*
  * Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows an all-or-nothing
 * approach.
 *
 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +745,16 @@ error_return:
 */
 int
 xfs_trans_reserve_quota_bydquots(
-	xfs_trans_t	*tp,
-	xfs_mount_t	*mp,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp,
-	long		nblks,
-	long		ninos,
-	uint		flags)
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	long			nblks,
+	long			ninos,
+	uint			flags)
 {
-	int		resvd = 0, error;
+	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
@@ -771,28 +769,34 @@ xfs_trans_reserve_quota_bydquots(
 				(flags & ~XFS_QMOPT_ENOSPC));
 		if (error)
 			return error;
-		resvd = 1;
 	}
 
 	if (gdqp) {
 		error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
-		if (error) {
-			/*
-			 * can't do it, so backout previous reservation
-			 */
-			if (resvd) {
-				flags |= XFS_QMOPT_FORCE_RES;
-				xfs_trans_dqresv(tp, mp, udqp,
-						 -nblks, -ninos, flags);
-			}
-			return error;
-		}
+		if (error)
+			goto unwind_usr;
+	}
+
+	if (pdqp) {
+		error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_grp;
 	}
 
 	/*
 	 * Didn't change anything critical, so, no need to log
 	 */
 	return 0;
+
+unwind_grp:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (gdqp)
+		xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (udqp)
+		xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+	return error;
 }
 
 
@@ -816,8 +820,7 @@ xfs_trans_reserve_quota_nblks(
 	if (XFS_IS_PQUOTA_ON(mp))
 		flags |= XFS_QMOPT_ENOSPC;
 
-	ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
-	ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +833,7 @@ xfs_trans_reserve_quota_nblks(
 	 */
 	return xfs_trans_reserve_quota_bydquots(tp, mp,
 						ip->i_udquot, ip->i_gdquot,
+						ip->i_pdquot,
 						nblks, ninos, flags);
 }
 
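
The rewritten error path in xfs_trans_reserve_quota_bydquots() is a classic goto-unwind ladder: reservations are taken in user, group, project order and backed out in reverse on failure. Below is a runnable model of the pattern; reserve() is only a stand-in for xfs_trans_dqresv(), and the forced back-out mirrors XFS_QMOPT_FORCE_RES bypassing limit checks on the way down.

	#include <stdio.h>

	static int reserve(const char *q, long nblks, int fail)
	{
		if (fail) {
			printf("reserve %-4s: FAILED\n", q);
			return -1;
		}
		printf("reserve %-4s: %+ld blocks\n", q, nblks);
		return 0;
	}

	static int reserve_bydquots(int has_u, int has_g, int has_p,
				    long nblks, int fail_prj)
	{
		int error;

		if (has_u && (error = reserve("usr", nblks, 0)))
			return error;
		if (has_g && (error = reserve("grp", nblks, 0)))
			goto unwind_usr;
		if (has_p && (error = reserve("prj", nblks, fail_prj)))
			goto unwind_grp;
		return 0;

	unwind_grp:
		if (has_g)
			reserve("grp", -nblks, 0);	/* forced back-out */
	unwind_usr:
		if (has_u)
			reserve("usr", -nblks, 0);
		return error;
	}

	int main(void)
	{
		int error = reserve_bydquots(1, 1, 1, 8, 1);

		printf("result: %d\n", error);
		return 0;
	}

The labelled unwind replaces the old resvd flag and scales cleanly from two quota types to three, which is exactly why the project-quota plumbing forced this rewrite.
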
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
 	ASSERT(ip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+	/*
+	 * The first time we log the inode in a transaction, bump the inode
+	 * change counter if it is configured for this to occur.
+	 */
+	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	    IS_I_VERSION(VFS_I(ip))) {
+		inode_inc_iversion(VFS_I(ip));
+		ip->i_d.di_changecount = VFS_I(ip)->i_version;
+		flags |= XFS_ILOG_CORE;
+	}
+
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
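
The xfs_trans_log_inode() hunk keys the i_version bump off XFS_LID_DIRTY so the counter moves once per transaction, not once per relogging of the same inode. A minimal model of that first-dirty gate (field names only loosely mirror the XFS ones):

	#include <stdio.h>

	struct inode_item {
		int			dirty_in_tp;	/* stands in for XFS_LID_DIRTY */
		unsigned long long	i_version;
		unsigned long long	di_changecount;	/* on-disk copy */
	};

	static void log_inode(struct inode_item *ip)
	{
		if (!ip->dirty_in_tp) {	/* first log in this transaction */
			ip->i_version++;
			ip->di_changecount = ip->i_version;
		}
		ip->dirty_in_tp = 1;
	}

	int main(void)
	{
		struct inode_item ip = { 0, 41, 41 };

		log_inode(&ip);		/* bumps: 41 -> 42 */
		log_inode(&ip);		/* relogged, no further bump */
		printf("i_version=%llu changecount=%llu\n",
		       ip.i_version, ip.di_changecount);
		return 0;
	}
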
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..dc730ac272be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	if (S_ISLNK(ip->i_d.di_mode)) {
-		/*
-		 * Zero length symlinks _can_ exist.
-		 */
-		if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
-			error = xfs_inactive_symlink_rmt(ip, &tp);
-			if (error)
-				goto out_cancel;
-		} else if (ip->i_df.if_bytes > 0) {
-			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
-					  XFS_DATA_FORK);
-			ASSERT(ip->i_df.if_bytes == 0);
-		}
+		error = xfs_inactive_symlink(ip, &tp);
+		if (error)
+			goto out_cancel;
 	} else if (truncate) {
 		ip->i_d.di_size = 0;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -498,6 +489,7 @@ xfs_create(
 	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
 	uint			resblks;
 	uint			log_res;
 	uint			log_count;
@@ -516,7 +508,8 @@ xfs_create(
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
@@ -568,7 +561,8 @@ xfs_create(
 	/*
 	 * Reserve disk quota and the inode.
 	 */
-	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+					pdqp, resblks, 1, 0);
 	if (error)
 		goto out_trans_cancel;
 
@@ -632,7 +626,7 @@ xfs_create(
 	 * These ids of the inode couldn't have changed since the new
 	 * inode has been locked ever since it was created.
 	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
@@ -644,6 +638,7 @@ xfs_create(
 
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
 
 	*ipp = ip;
 	return 0;
@@ -665,6 +660,7 @@ xfs_create(
 
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1577,7 +1573,7 @@ xfs_free_file_space(
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota(tp, mp,
-			ip->i_udquot, ip->i_gdquot,
+			ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
 			resblks, 0, XFS_QMOPT_RES_REGBLKS);
 	if (error)
 		goto error1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d9808..38c67c34d73f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
-		       xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
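
The xfs_readdir() prototype change reflects the VFS readdir rework in this merge: the opaque dirent buffer, offset pointer and filldir_t callback collapse into a single struct dir_context that bundles the actor callback with the current position. A user-space model of the shape of that interface follows; all names here are hypothetical and only loosely mirror the kernel's dir_emit() usage.

	#include <stdio.h>
	#include <stdbool.h>

	struct dir_context {
		bool (*actor)(struct dir_context *ctx, const char *name,
			      unsigned long long ino);
		long long pos;		/* cursor lives in the context now */
	};

	/* consume one entry; returning false stops iteration */
	static bool demo_actor(struct dir_context *ctx, const char *name,
			       unsigned long long ino)
	{
		printf("pos %lld: %s (ino %llu)\n", ctx->pos, name, ino);
		return true;
	}

	/* filesystem side: walk entries from ctx->pos, emitting via the actor */
	static int demo_readdir(struct dir_context *ctx)
	{
		const char *names[] = { ".", "..", "a.txt" };

		for (; ctx->pos < 3; ctx->pos++)
			if (!ctx->actor(ctx, names[ctx->pos],
					(unsigned long long)(0x85 + ctx->pos)))
				return 0;
		return 0;
	}

	int main(void)
	{
		struct dir_context ctx = { demo_actor, 0 };
		return demo_readdir(&ctx);
	}
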